From 5037a00992e5fcb3d8509964313565a3dab6697c Mon Sep 17 00:00:00 2001
From: Sunil Dutt <usdutt@qti.qualcomm.com>
Date: Thu, 25 Jan 2018 17:13:37 +0200
Subject: nl80211: Introduce scan flags to emphasize requested scan behavior

This commit defines new scan flags (LOW_SPAN, LOW_POWER, HIGH_LATENCY)
to emphasize the requested scan behavior for the driver. These flags
are optional and are mutually exclusive. The implementation of the
respective functionality can be driver/hardware specific.

These flags can be used to control the compromise between how long
a scan takes, how much power it uses, and high accurate/complete
the scan is in finding the BSSs.

Signed-off-by: Sunil Dutt <usdutt@qti.qualcomm.com>
Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ab0c687d0c44..dd249ec9f228 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -6715,8 +6715,17 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev,
 
 	*flags = nla_get_u32(attrs[NL80211_ATTR_SCAN_FLAGS]);
 
-	if ((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) &&
-	    !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN))
+	if (((*flags & NL80211_SCAN_FLAG_LOW_PRIORITY) &&
+	     !(wiphy->features & NL80211_FEATURE_LOW_PRIORITY_SCAN)) ||
+	    ((*flags & NL80211_SCAN_FLAG_LOW_SPAN) &&
+	     !wiphy_ext_feature_isset(wiphy,
+				      NL80211_EXT_FEATURE_LOW_SPAN_SCAN)) ||
+	    ((*flags & NL80211_SCAN_FLAG_LOW_POWER) &&
+	     !wiphy_ext_feature_isset(wiphy,
+				      NL80211_EXT_FEATURE_LOW_POWER_SCAN)) ||
+	    ((*flags & NL80211_SCAN_FLAG_HIGH_ACCURACY) &&
+	     !wiphy_ext_feature_isset(wiphy,
+				      NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN)))
 		return -EOPNOTSUPP;
 
 	if (*flags & NL80211_SCAN_FLAG_RANDOM_ADDR) {
-- 
cgit v1.2.3


From 40cbfa90218bc570a7959b436b9d48a18c361041 Mon Sep 17 00:00:00 2001
From: Srinivas Dasari <dasaris@qti.qualcomm.com>
Date: Thu, 25 Jan 2018 17:13:38 +0200
Subject: cfg80211/nl80211: Optional authentication offload to userspace

This interface allows the host driver to offload the authentication to
user space. This is exclusively defined for host drivers that do not
define separate commands for authentication and association, but rely on
userspace SME (e.g., in wpa_supplicant for the ~WPA_DRIVER_FLAGS_SME
case) for the authentication to happen. This can be used to implement
SAE without full implementation in the kernel/firmware while still being
able to use NL80211_CMD_CONNECT with driver-based BSS selection.

Host driver sends NL80211_CMD_EXTERNAL_AUTH event to start/abort
authentication to the port on which connect is triggered and status
of authentication is further indicated by user space to host
driver through the same command response interface.

User space entities advertise this capability through the
NL80211_ATTR_EXTERNAL_AUTH_SUPP flag in the NL80211_CMD_CONNECT request.
Host drivers shall look at this capability to offload the authentication.

Signed-off-by: Srinivas Dasari <dasaris@qti.qualcomm.com>
Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
[add socket connection ownership check]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c  | 94 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/wireless/rdev-ops.h | 15 ++++++++
 net/wireless/trace.h    | 23 ++++++++++++
 3 files changed, 132 insertions(+)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index dd249ec9f228..aa6b64069c80 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -420,6 +420,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_FILS_CACHE_ID] = { .len = 2 },
 	[NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN },
 	[NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG },
+	[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG },
 };
 
 /* policy for the key attributes */
@@ -9161,6 +9162,15 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 		return -EINVAL;
 	}
 
+	if (nla_get_flag(info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT])) {
+		if (!info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
+			GENL_SET_ERR_MSG(info,
+					 "external auth requires connection ownership");
+			return -EINVAL;
+		}
+		connect.flags |= CONNECT_REQ_EXTERNAL_AUTH_SUPPORT;
+	}
+
 	wdev_lock(dev->ieee80211_ptr);
 
 	err = cfg80211_connect(rdev, dev, &connect, connkeys,
@@ -12469,6 +12479,41 @@ static int nl80211_del_pmk(struct sk_buff *skb, struct genl_info *info)
 	return ret;
 }
 
+static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct cfg80211_external_auth_params params;
+
+	if (rdev->ops->external_auth)
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[NL80211_ATTR_SSID])
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_BSSID])
+		return -EINVAL;
+
+	if (!info->attrs[NL80211_ATTR_STATUS_CODE])
+		return -EINVAL;
+
+	memset(&params, 0, sizeof(params));
+
+	params.ssid.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]);
+	if (params.ssid.ssid_len == 0 ||
+	    params.ssid.ssid_len > IEEE80211_MAX_SSID_LEN)
+		return -EINVAL;
+	memcpy(params.ssid.ssid, nla_data(info->attrs[NL80211_ATTR_SSID]),
+	       params.ssid.ssid_len);
+
+	memcpy(params.bssid, nla_data(info->attrs[NL80211_ATTR_BSSID]),
+	       ETH_ALEN);
+
+	params.status = nla_get_u16(info->attrs[NL80211_ATTR_STATUS_CODE]);
+
+	return rdev_external_auth(rdev, dev, &params);
+}
+
 #define NL80211_FLAG_NEED_WIPHY		0x01
 #define NL80211_FLAG_NEED_NETDEV	0x02
 #define NL80211_FLAG_NEED_RTNL		0x04
@@ -13364,6 +13409,14 @@ static const struct genl_ops nl80211_ops[] = {
 		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
 				  NL80211_FLAG_NEED_RTNL,
 	},
+	{
+		.cmd = NL80211_CMD_EXTERNAL_AUTH,
+		.doit = nl80211_external_auth,
+		.policy = nl80211_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
 
 };
 
@@ -15375,6 +15428,47 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev)
 	nlmsg_free(msg);
 }
 
+int cfg80211_external_auth_request(struct net_device *dev,
+				   struct cfg80211_external_auth_params *params,
+				   gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+	struct sk_buff *msg;
+	void *hdr;
+
+	if (!wdev->conn_owner_nlportid)
+		return -EINVAL;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_EXTERNAL_AUTH);
+	if (!hdr)
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+	    nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
+	    nla_put_u32(msg, NL80211_ATTR_AKM_SUITES, params->key_mgmt_suite) ||
+	    nla_put_u32(msg, NL80211_ATTR_EXTERNAL_AUTH_ACTION,
+			params->action) ||
+	    nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, params->bssid) ||
+	    nla_put(msg, NL80211_ATTR_SSID, params->ssid.ssid_len,
+		    params->ssid.ssid))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	genlmsg_unicast(wiphy_net(&rdev->wiphy), msg,
+			wdev->conn_owner_nlportid);
+	return 0;
+
+ nla_put_failure:
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+EXPORT_SYMBOL(cfg80211_external_auth_request);
+
 /* initialisation/exit functions */
 
 int __init nl80211_init(void)
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 0c06240d25af..84f23ae015fc 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -1190,4 +1190,19 @@ static inline int rdev_del_pmk(struct cfg80211_registered_device *rdev,
 	trace_rdev_return_int(&rdev->wiphy, ret);
 	return ret;
 }
+
+static inline int
+rdev_external_auth(struct cfg80211_registered_device *rdev,
+		   struct net_device *dev,
+		   struct cfg80211_external_auth_params *params)
+{
+	int ret = -EOPNOTSUPP;
+
+	trace_rdev_external_auth(&rdev->wiphy, dev, params);
+	if (rdev->ops->external_auth)
+		ret = rdev->ops->external_auth(&rdev->wiphy, dev, params);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
 #endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index bcfedd39e7a3..5152938b358d 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2319,6 +2319,29 @@ TRACE_EVENT(rdev_del_pmk,
 		  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(aa))
 );
 
+TRACE_EVENT(rdev_external_auth,
+	    TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		     struct cfg80211_external_auth_params *params),
+	    TP_ARGS(wiphy, netdev, params),
+	    TP_STRUCT__entry(WIPHY_ENTRY
+			     NETDEV_ENTRY
+			     MAC_ENTRY(bssid)
+			     __array(u8, ssid, IEEE80211_MAX_SSID_LEN + 1)
+			     __field(u16, status)
+	    ),
+	    TP_fast_assign(WIPHY_ASSIGN;
+			   NETDEV_ASSIGN;
+			   MAC_ASSIGN(bssid, params->bssid);
+			   memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1);
+			   memcpy(__entry->ssid, params->ssid.ssid,
+				  params->ssid.ssid_len);
+			   __entry->status = params->status;
+	    ),
+	    TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT
+		      ", ssid: %s, status: %u", WIPHY_PR_ARG, NETDEV_PR_ARG,
+		      __entry->bssid, __entry->ssid, __entry->status)
+);
+
 /*************************************************************
  *	     cfg80211 exported functions traces		     *
  *************************************************************/
-- 
cgit v1.2.3


From 10773a7c09b327d02144c7d181e6544b7015ffc7 Mon Sep 17 00:00:00 2001
From: Srinivas Dasari <dasaris@qti.qualcomm.com>
Date: Thu, 25 Jan 2018 17:13:39 +0200
Subject: nl80211: Allow SAE Authentication for NL80211_CMD_CONNECT

This commit allows SAE Authentication for NL80211_CMD_CONNECT
interface, provided host driver advertises the support.

Host drivers may offload the SAE authentication to user space
through NL80211_CMD_EXTERNAL_AUTH interface and thus expect
the user space to advertise support to handle offload through
NL80211_ATTR_EXTERNAL_AUTH_SUPPORT in NL80211_CMD_CONNECT
request. Such drivers should reject the connect request on no
offload support from user space.

Signed-off-by: Srinivas Dasari <dasaris@qti.qualcomm.com>
Signed-off-by: Jouni Malinen <jouni@qca.qualcomm.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index aa6b64069c80..bdb70fe74e3c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3921,9 +3921,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
 			return false;
 		return true;
 	case NL80211_CMD_CONNECT:
-		/* SAE not supported yet */
-		if (auth_type == NL80211_AUTHTYPE_SAE)
+		if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
+		    auth_type == NL80211_AUTHTYPE_SAE)
 			return false;
+
 		/* FILS with SK PFS or PK not supported yet */
 		if (auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
 		    auth_type == NL80211_AUTHTYPE_FILS_PK)
-- 
cgit v1.2.3


From 25b0ba7ebbc5ffa5a64d752b85af78e6e517e3b2 Mon Sep 17 00:00:00 2001
From: Ben Greear <greearb@candelatech.com>
Date: Fri, 26 Jan 2018 17:14:19 -0800
Subject: mac80211: Add txq flags to debugfs

Might help one figure out why aqm drivers may fail to transmit
frames sometimes.

Signed-off-by: Ben Greear <greearb@candelatech.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/debugfs_sta.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 444ea8d127fe..4105081dc1df 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -160,12 +160,12 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
 		       sta->cparams.ecn ? "yes" : "no");
 	p += scnprintf(p,
 		       bufsz+buf-p,
-		       "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets\n");
+		       "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n");
 
 	for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
 		txqi = to_txq_info(sta->sta.txq[i]);
 		p += scnprintf(p, bufsz+buf-p,
-			       "%d %d %u %u %u %u %u %u %u %u %u\n",
+			       "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s)\n",
 			       txqi->txq.tid,
 			       txqi->txq.ac,
 			       txqi->tin.backlog_bytes,
@@ -176,7 +176,11 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
 			       txqi->tin.overlimit,
 			       txqi->tin.collisions,
 			       txqi->tin.tx_bytes,
-			       txqi->tin.tx_packets);
+			       txqi->tin.tx_packets,
+			       txqi->flags,
+			       txqi->flags & (1<<IEEE80211_TXQ_STOP) ? "STOP" : "RUN",
+			       txqi->flags & (1<<IEEE80211_TXQ_AMPDU) ? " AMPDU" : "",
+			       txqi->flags & (1<<IEEE80211_TXQ_NO_AMSDU) ? " NO-AMSDU" : "");
 	}
 
 	rcu_read_unlock();
-- 
cgit v1.2.3


From 466b9936bf93b7ec3bce1dcd493262ff0a8a4f44 Mon Sep 17 00:00:00 2001
From: "tamizhr@codeaurora.org" <tamizhr@codeaurora.org>
Date: Wed, 31 Jan 2018 16:24:49 +0530
Subject: cfg80211: Add support to notify station's opmode change to userspace

ht/vht action frames will be sent to AP from station to notify
change of its ht/vht opmode(max bandwidth, smps mode or nss) modified
values. Currently these valuse used by driver/firmware for rate control
algorithm. This patch introduces NL80211_CMD_STA_OPMODE_CHANGED
command to notify those modified/current supported values(max bandwidth,
smps mode, max nss) to userspace application. This will be useful for the
application like steering, which closely monitoring station's capability
changes. Since the application has taken these values during station
association.

Signed-off-by: Tamizh chelvam <tamizhr@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index bdb70fe74e3c..cc6ec5bab676 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -14950,6 +14950,61 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev,
 	nlmsg_free(msg);
 }
 
+void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac,
+				       struct sta_opmode_info *sta_opmode,
+				       gfp_t gfp)
+{
+	struct sk_buff *msg;
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+	void *hdr;
+
+	if (WARN_ON(!mac))
+		return;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	if (!msg)
+		return;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_STA_OPMODE_CHANGED);
+	if (!hdr) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex))
+		goto nla_put_failure;
+
+	if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, mac))
+		goto nla_put_failure;
+
+	if ((sta_opmode->changed & STA_OPMODE_SMPS_MODE_CHANGED) &&
+	    nla_put_u8(msg, NL80211_ATTR_SMPS_MODE, sta_opmode->smps_mode))
+		goto nla_put_failure;
+
+	if ((sta_opmode->changed & STA_OPMODE_MAX_BW_CHANGED) &&
+	    nla_put_u8(msg, NL80211_ATTR_CHANNEL_WIDTH, sta_opmode->bw))
+		goto nla_put_failure;
+
+	if ((sta_opmode->changed & STA_OPMODE_N_SS_CHANGED) &&
+	    nla_put_u8(msg, NL80211_ATTR_NSS, sta_opmode->rx_nss))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+
+	genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+				NL80211_MCGRP_MLME, gfp);
+
+	return;
+
+nla_put_failure:
+	nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_sta_opmode_change_notify);
+
 void cfg80211_probe_status(struct net_device *dev, const u8 *addr,
 			   u64 cookie, bool acked, gfp_t gfp)
 {
-- 
cgit v1.2.3


From ff84e7bfe1763982e1ada390386dbe4739cee1e2 Mon Sep 17 00:00:00 2001
From: "tamizhr@codeaurora.org" <tamizhr@codeaurora.org>
Date: Wed, 31 Jan 2018 16:24:50 +0530
Subject: mac80211: Add support to notify ht/vht opmode modification.

This will add support to send an event to a userspace application
whenever station advertise its ht/vht opmode modification through
an action frame.

Signed-off-by: Tamizh chelvam <tamizhr@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c  | 14 ++++++++++++++
 net/mac80211/vht.c |  9 +++++++++
 2 files changed, 23 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index fd580614085b..e755f93ad735 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2848,6 +2848,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 		case WLAN_HT_ACTION_SMPS: {
 			struct ieee80211_supported_band *sband;
 			enum ieee80211_smps_mode smps_mode;
+			struct sta_opmode_info sta_opmode = {};
 
 			/* convert to HT capability */
 			switch (mgmt->u.action.u.ht_smps.smps_control) {
@@ -2868,17 +2869,24 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 			if (rx->sta->sta.smps_mode == smps_mode)
 				goto handled;
 			rx->sta->sta.smps_mode = smps_mode;
+			sta_opmode.smps_mode = smps_mode;
+			sta_opmode.changed = STA_OPMODE_SMPS_MODE_CHANGED;
 
 			sband = rx->local->hw.wiphy->bands[status->band];
 
 			rate_control_rate_update(local, sband, rx->sta,
 						 IEEE80211_RC_SMPS_CHANGED);
+			cfg80211_sta_opmode_change_notify(sdata->dev,
+							  rx->sta->addr,
+							  &sta_opmode,
+							  GFP_KERNEL);
 			goto handled;
 		}
 		case WLAN_HT_ACTION_NOTIFY_CHANWIDTH: {
 			struct ieee80211_supported_band *sband;
 			u8 chanwidth = mgmt->u.action.u.ht_notify_cw.chanwidth;
 			enum ieee80211_sta_rx_bandwidth max_bw, new_bw;
+			struct sta_opmode_info sta_opmode = {};
 
 			/* If it doesn't support 40 MHz it can't change ... */
 			if (!(rx->sta->sta.ht_cap.cap &
@@ -2899,9 +2907,15 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 
 			rx->sta->sta.bandwidth = new_bw;
 			sband = rx->local->hw.wiphy->bands[status->band];
+			sta_opmode.bw = new_bw;
+			sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED;
 
 			rate_control_rate_update(local, sband, rx->sta,
 						 IEEE80211_RC_BW_CHANGED);
+			cfg80211_sta_opmode_change_notify(sdata->dev,
+							  rx->sta->addr,
+							  &sta_opmode,
+							  GFP_KERNEL);
 			goto handled;
 		}
 		default:
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index b9276ac849fa..5714dee76b12 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -447,6 +447,7 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 				  enum nl80211_band band)
 {
 	enum ieee80211_sta_rx_bandwidth new_bw;
+	struct sta_opmode_info sta_opmode = {};
 	u32 changed = 0;
 	u8 nss;
 
@@ -460,7 +461,9 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 
 	if (sta->sta.rx_nss != nss) {
 		sta->sta.rx_nss = nss;
+		sta_opmode.rx_nss = nss;
 		changed |= IEEE80211_RC_NSS_CHANGED;
+		sta_opmode.changed |= STA_OPMODE_N_SS_CHANGED;
 	}
 
 	switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) {
@@ -481,9 +484,15 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 	new_bw = ieee80211_sta_cur_vht_bw(sta);
 	if (new_bw != sta->sta.bandwidth) {
 		sta->sta.bandwidth = new_bw;
+		sta_opmode.bw = new_bw;
 		changed |= IEEE80211_RC_BW_CHANGED;
+		sta_opmode.changed |= STA_OPMODE_MAX_BW_CHANGED;
 	}
 
+	if (sta_opmode.changed)
+		cfg80211_sta_opmode_change_notify(sdata->dev, sta->addr,
+						  &sta_opmode, GFP_KERNEL);
+
 	return changed;
 }
 
-- 
cgit v1.2.3


From 62ebdc25c4002e5fc104ab536ce7d99ba76be03f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=81ukasz=20Rymanowski?= <lukasz.rymanowski@codecoup.pl>
Date: Fri, 9 Feb 2018 18:26:02 +0100
Subject: Bluetooth: Fix incorrect bits for LE states
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixes incorrect checks for LE states.
Issues found when doing mgmt tests for scenario
when Linux Kernel should do connectable advertising
while connected.

Signed-off-by: Łukasz Rymanowski <lukasz.rymanowski@codecoup.pl>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_request.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 3394e6791673..66c0781773df 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -934,8 +934,8 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable)
 		/* Slave connection state and connectable mode bit 38
 		 * and scannable bit 21.
 		 */
-		if (connectable && (!(hdev->le_states[4] & 0x01) ||
-				    !(hdev->le_states[2] & 0x40)))
+		if (connectable && (!(hdev->le_states[4] & 0x40) ||
+				    !(hdev->le_states[2] & 0x20)))
 			return false;
 	}
 
@@ -948,7 +948,7 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable)
 		/* Master connection state and connectable mode bit 35 and
 		 * scannable 19.
 		 */
-		if (connectable && (!(hdev->le_states[4] & 0x10) ||
+		if (connectable && (!(hdev->le_states[4] & 0x08) ||
 				    !(hdev->le_states[2] & 0x08)))
 			return false;
 	}
-- 
cgit v1.2.3


From 9b2c45d479d0fb8647c9e83359df69162b5fbe5f Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Mon, 12 Feb 2018 20:00:20 +0100
Subject: net: make getname() functions return length rather than use int*
 parameter

Changes since v1:
Added changes in these files:
    drivers/infiniband/hw/usnic/usnic_transport.c
    drivers/staging/lustre/lnet/lnet/lib-socket.c
    drivers/target/iscsi/iscsi_target_login.c
    drivers/vhost/net.c
    fs/dlm/lowcomms.c
    fs/ocfs2/cluster/tcp.c
    security/tomoyo/network.c

Before:
All these functions either return a negative error indicator,
or store length of sockaddr into "int *socklen" parameter
and return zero on success.

"int *socklen" parameter is awkward. For example, if caller does not
care, it still needs to provide on-stack storage for the value
it does not need.

None of the many FOO_getname() functions of various protocols
ever used old value of *socklen. They always just overwrite it.

This change drops this parameter, and makes all these functions, on success,
return length of sockaddr. It's always >= 0 and can be differentiated
from an error.

Tests in callers are changed from "if (err)" to "if (err < 0)", where needed.

rpc_sockname() lost "int buflen" parameter, since its only use was
to be passed to kernel_getsockname() as &buflen and subsequently
not used in any way.

Userspace API is not changed.

    text    data     bss      dec     hex filename
30108430 2633624  873672 33615726 200ef6e vmlinux.before.o
30108109 2633612  873672 33615393 200ee21 vmlinux.o

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
CC: David S. Miller <davem@davemloft.net>
CC: linux-kernel@vger.kernel.org
CC: netdev@vger.kernel.org
CC: linux-bluetooth@vger.kernel.org
CC: linux-decnet-user@lists.sourceforge.net
CC: linux-wireless@vger.kernel.org
CC: linux-rdma@vger.kernel.org
CC: linux-sctp@vger.kernel.org
CC: linux-nfs@vger.kernel.org
CC: linux-x25@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/appletalk/ddp.c         |  5 ++---
 net/atm/pvc.c               |  5 ++---
 net/atm/svc.c               |  5 ++---
 net/ax25/af_ax25.c          |  4 ++--
 net/bluetooth/hci_sock.c    |  4 ++--
 net/bluetooth/l2cap_sock.c  |  5 ++---
 net/bluetooth/rfcomm/sock.c |  5 ++---
 net/bluetooth/sco.c         |  5 ++---
 net/can/raw.c               |  6 ++----
 net/core/sock.c             |  5 +++--
 net/decnet/af_decnet.c      |  6 ++----
 net/ipv4/af_inet.c          |  5 ++---
 net/ipv6/af_inet6.c         |  5 ++---
 net/iucv/af_iucv.c          |  5 ++---
 net/l2tp/l2tp_ip.c          |  5 ++---
 net/l2tp/l2tp_ip6.c         |  5 ++---
 net/l2tp/l2tp_ppp.c         |  5 ++---
 net/llc/af_llc.c            |  5 ++---
 net/netlink/af_netlink.c    |  5 ++---
 net/netrom/af_netrom.c      |  9 +++++----
 net/nfc/llcp_sock.c         |  5 ++---
 net/packet/af_packet.c      | 10 ++++------
 net/phonet/socket.c         |  5 ++---
 net/qrtr/qrtr.c             |  5 ++---
 net/rds/af_rds.c            |  5 ++---
 net/rds/tcp.c               |  7 ++-----
 net/rose/af_rose.c          |  5 ++---
 net/sctp/ipv6.c             |  8 ++++----
 net/smc/af_smc.c            | 11 +++++------
 net/socket.c                | 35 +++++++++++++++++------------------
 net/sunrpc/clnt.c           |  6 +++---
 net/sunrpc/svcsock.c        | 13 ++++++++-----
 net/sunrpc/xprtsock.c       |  3 +--
 net/tipc/socket.c           |  5 ++---
 net/unix/af_unix.c          | 10 +++++-----
 net/vmw_vsock/af_vsock.c    |  4 ++--
 net/x25/af_x25.c            |  4 ++--
 37 files changed, 109 insertions(+), 136 deletions(-)

(limited to 'net')

diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 03a9fc0771c0..9b6bc5abe946 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1238,7 +1238,7 @@ out:
  * fields into the sockaddr.
  */
 static int atalk_getname(struct socket *sock, struct sockaddr *uaddr,
-			 int *uaddr_len, int peer)
+			 int peer)
 {
 	struct sockaddr_at sat;
 	struct sock *sk = sock->sk;
@@ -1251,7 +1251,6 @@ static int atalk_getname(struct socket *sock, struct sockaddr *uaddr,
 		if (atalk_autobind(sk) < 0)
 			goto out;
 
-	*uaddr_len = sizeof(struct sockaddr_at);
 	memset(&sat, 0, sizeof(sat));
 
 	if (peer) {
@@ -1268,9 +1267,9 @@ static int atalk_getname(struct socket *sock, struct sockaddr *uaddr,
 		sat.sat_port	    = at->src_port;
 	}
 
-	err = 0;
 	sat.sat_family = AF_APPLETALK;
 	memcpy(uaddr, &sat, sizeof(sat));
+	err = sizeof(struct sockaddr_at);
 
 out:
 	release_sock(sk);
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index e1140b3bdcaa..2cb10af16afc 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -87,21 +87,20 @@ static int pvc_getsockopt(struct socket *sock, int level, int optname,
 }
 
 static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr,
-		       int *sockaddr_len, int peer)
+		       int peer)
 {
 	struct sockaddr_atmpvc *addr;
 	struct atm_vcc *vcc = ATM_SD(sock);
 
 	if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags))
 		return -ENOTCONN;
-	*sockaddr_len = sizeof(struct sockaddr_atmpvc);
 	addr = (struct sockaddr_atmpvc *)sockaddr;
 	memset(addr, 0, sizeof(*addr));
 	addr->sap_family = AF_ATMPVC;
 	addr->sap_addr.itf = vcc->dev->number;
 	addr->sap_addr.vpi = vcc->vpi;
 	addr->sap_addr.vci = vcc->vci;
-	return 0;
+	return sizeof(struct sockaddr_atmpvc);
 }
 
 static const struct proto_ops pvc_proto_ops = {
diff --git a/net/atm/svc.c b/net/atm/svc.c
index c458adcbc177..2f91b766ac42 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -419,15 +419,14 @@ out:
 }
 
 static int svc_getname(struct socket *sock, struct sockaddr *sockaddr,
-		       int *sockaddr_len, int peer)
+		       int peer)
 {
 	struct sockaddr_atmsvc *addr;
 
-	*sockaddr_len = sizeof(struct sockaddr_atmsvc);
 	addr = (struct sockaddr_atmsvc *) sockaddr;
 	memcpy(addr, peer ? &ATM_SD(sock)->remote : &ATM_SD(sock)->local,
 	       sizeof(struct sockaddr_atmsvc));
-	return 0;
+	return sizeof(struct sockaddr_atmsvc);
 }
 
 int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 47fdd399626b..c8319ed48485 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1388,7 +1388,7 @@ out:
 }
 
 static int ax25_getname(struct socket *sock, struct sockaddr *uaddr,
-	int *uaddr_len, int peer)
+	int peer)
 {
 	struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr;
 	struct sock *sk = sock->sk;
@@ -1427,7 +1427,7 @@ static int ax25_getname(struct socket *sock, struct sockaddr *uaddr,
 			fsa->fsa_digipeater[0] = null_ax25_address;
 		}
 	}
-	*uaddr_len = sizeof (struct full_sockaddr_ax25);
+	err = sizeof (struct full_sockaddr_ax25);
 
 out:
 	release_sock(sk);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 923e9a271872..1506e1632394 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -1340,7 +1340,7 @@ done:
 }
 
 static int hci_sock_getname(struct socket *sock, struct sockaddr *addr,
-			    int *addr_len, int peer)
+			    int peer)
 {
 	struct sockaddr_hci *haddr = (struct sockaddr_hci *)addr;
 	struct sock *sk = sock->sk;
@@ -1360,10 +1360,10 @@ static int hci_sock_getname(struct socket *sock, struct sockaddr *addr,
 		goto done;
 	}
 
-	*addr_len = sizeof(*haddr);
 	haddr->hci_family = AF_BLUETOOTH;
 	haddr->hci_dev    = hdev->id;
 	haddr->hci_channel= hci_pi(sk)->channel;
+	err = sizeof(*haddr);
 
 done:
 	release_sock(sk);
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 67a8642f57ea..686bdc6b35b0 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -358,7 +358,7 @@ done:
 }
 
 static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr,
-			      int *len, int peer)
+			      int peer)
 {
 	struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr;
 	struct sock *sk = sock->sk;
@@ -373,7 +373,6 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr,
 
 	memset(la, 0, sizeof(struct sockaddr_l2));
 	addr->sa_family = AF_BLUETOOTH;
-	*len = sizeof(struct sockaddr_l2);
 
 	la->l2_psm = chan->psm;
 
@@ -387,7 +386,7 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr,
 		la->l2_bdaddr_type = chan->src_type;
 	}
 
-	return 0;
+	return sizeof(struct sockaddr_l2);
 }
 
 static int l2cap_sock_getsockopt_old(struct socket *sock, int optname,
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 1aaccf637479..93a3b219db09 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -533,7 +533,7 @@ done:
 	return err;
 }
 
-static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer)
+static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int peer)
 {
 	struct sockaddr_rc *sa = (struct sockaddr_rc *) addr;
 	struct sock *sk = sock->sk;
@@ -552,8 +552,7 @@ static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int *
 	else
 		bacpy(&sa->rc_bdaddr, &rfcomm_pi(sk)->src);
 
-	*len = sizeof(struct sockaddr_rc);
-	return 0;
+	return sizeof(struct sockaddr_rc);
 }
 
 static int rfcomm_sock_sendmsg(struct socket *sock, struct msghdr *msg,
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 08df57665e1f..413b8ee49fec 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -680,7 +680,7 @@ done:
 }
 
 static int sco_sock_getname(struct socket *sock, struct sockaddr *addr,
-			    int *len, int peer)
+			    int peer)
 {
 	struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
 	struct sock *sk = sock->sk;
@@ -688,14 +688,13 @@ static int sco_sock_getname(struct socket *sock, struct sockaddr *addr,
 	BT_DBG("sock %p, sk %p", sock, sk);
 
 	addr->sa_family = AF_BLUETOOTH;
-	*len = sizeof(struct sockaddr_sco);
 
 	if (peer)
 		bacpy(&sa->sco_bdaddr, &sco_pi(sk)->dst);
 	else
 		bacpy(&sa->sco_bdaddr, &sco_pi(sk)->src);
 
-	return 0;
+	return sizeof(struct sockaddr_sco);
 }
 
 static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg,
diff --git a/net/can/raw.c b/net/can/raw.c
index f2ecc43376a1..1051eee82581 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -470,7 +470,7 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
 }
 
 static int raw_getname(struct socket *sock, struct sockaddr *uaddr,
-		       int *len, int peer)
+		       int peer)
 {
 	struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
 	struct sock *sk = sock->sk;
@@ -483,9 +483,7 @@ static int raw_getname(struct socket *sock, struct sockaddr *uaddr,
 	addr->can_family  = AF_CAN;
 	addr->can_ifindex = ro->ifindex;
 
-	*len = sizeof(*addr);
-
-	return 0;
+	return sizeof(*addr);
 }
 
 static int raw_setsockopt(struct socket *sock, int level, int optname,
diff --git a/net/core/sock.c b/net/core/sock.c
index c501499a04fe..04e5e27c9b81 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1274,7 +1274,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 	{
 		char address[128];
 
-		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
+		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
+		if (lv < 0)
 			return -ENOTCONN;
 		if (lv < len)
 			return -EINVAL;
@@ -2497,7 +2498,7 @@ int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
 EXPORT_SYMBOL(sock_no_accept);
 
 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
-		    int *len, int peer)
+		    int peer)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 91dd09f79808..45cb5bea884b 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1180,14 +1180,12 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags,
 }
 
 
-static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int *uaddr_len,int peer)
+static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer)
 {
 	struct sockaddr_dn *sa = (struct sockaddr_dn *)uaddr;
 	struct sock *sk = sock->sk;
 	struct dn_scp *scp = DN_SK(sk);
 
-	*uaddr_len = sizeof(struct sockaddr_dn);
-
 	lock_sock(sk);
 
 	if (peer) {
@@ -1205,7 +1203,7 @@ static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int *uaddr_len
 
 	release_sock(sk);
 
-	return 0;
+	return sizeof(struct sockaddr_dn);
 }
 
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e4329e161943..f98e2f0db841 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -723,7 +723,7 @@ EXPORT_SYMBOL(inet_accept);
  *	This does both peername and sockname.
  */
 int inet_getname(struct socket *sock, struct sockaddr *uaddr,
-			int *uaddr_len, int peer)
+			int peer)
 {
 	struct sock *sk		= sock->sk;
 	struct inet_sock *inet	= inet_sk(sk);
@@ -745,8 +745,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 		sin->sin_addr.s_addr = addr;
 	}
 	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
-	*uaddr_len = sizeof(*sin);
-	return 0;
+	return sizeof(*sin);
 }
 EXPORT_SYMBOL(inet_getname);
 
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 416917719a6f..c1e292db04db 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -470,7 +470,7 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock);
  */
 
 int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
-		 int *uaddr_len, int peer)
+		 int peer)
 {
 	struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
 	struct sock *sk = sock->sk;
@@ -500,8 +500,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 	}
 	sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
 						 sk->sk_bound_dev_if);
-	*uaddr_len = sizeof(*sin);
-	return 0;
+	return sizeof(*sin);
 }
 EXPORT_SYMBOL(inet6_getname);
 
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 1e8cc7bcbca3..81ce15ffb878 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -989,14 +989,13 @@ done:
 }
 
 static int iucv_sock_getname(struct socket *sock, struct sockaddr *addr,
-			     int *len, int peer)
+			     int peer)
 {
 	struct sockaddr_iucv *siucv = (struct sockaddr_iucv *) addr;
 	struct sock *sk = sock->sk;
 	struct iucv_sock *iucv = iucv_sk(sk);
 
 	addr->sa_family = AF_IUCV;
-	*len = sizeof(struct sockaddr_iucv);
 
 	if (peer) {
 		memcpy(siucv->siucv_user_id, iucv->dst_user_id, 8);
@@ -1009,7 +1008,7 @@ static int iucv_sock_getname(struct socket *sock, struct sockaddr *addr,
 	memset(&siucv->siucv_addr, 0, sizeof(siucv->siucv_addr));
 	memset(&siucv->siucv_nodeid, 0, sizeof(siucv->siucv_nodeid));
 
-	return 0;
+	return sizeof(struct sockaddr_iucv);
 }
 
 /**
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index ff61124fdf59..4614585e1720 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -349,7 +349,7 @@ static int l2tp_ip_disconnect(struct sock *sk, int flags)
 }
 
 static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr,
-			   int *uaddr_len, int peer)
+			   int peer)
 {
 	struct sock *sk		= sock->sk;
 	struct inet_sock *inet	= inet_sk(sk);
@@ -370,8 +370,7 @@ static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr,
 		lsa->l2tp_conn_id = lsk->conn_id;
 		lsa->l2tp_addr.s_addr = addr;
 	}
-	*uaddr_len = sizeof(*lsa);
-	return 0;
+	return sizeof(*lsa);
 }
 
 static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb)
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 192344688c06..efea58b66295 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -421,7 +421,7 @@ static int l2tp_ip6_disconnect(struct sock *sk, int flags)
 }
 
 static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
-			    int *uaddr_len, int peer)
+			    int peer)
 {
 	struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr;
 	struct sock *sk = sock->sk;
@@ -449,8 +449,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
 	}
 	if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL)
 		lsa->l2tp_scope_id = sk->sk_bound_dev_if;
-	*uaddr_len = sizeof(*lsa);
-	return 0;
+	return sizeof(*lsa);
 }
 
 static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb)
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 59f246d7b290..99a03c72db4f 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -870,7 +870,7 @@ err:
 /* getname() support.
  */
 static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
-			    int *usockaddr_len, int peer)
+			    int peer)
 {
 	int len = 0;
 	int error = 0;
@@ -969,8 +969,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
 		memcpy(uaddr, &sp, len);
 	}
 
-	*usockaddr_len = len;
-	error = 0;
+	error = len;
 
 	sock_put(sk);
 end:
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index c38d16f22d2a..01dcc0823d1f 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -971,7 +971,7 @@ release:
  *	Return the address information of a socket.
  */
 static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr,
-			  int *uaddrlen, int peer)
+			  int peer)
 {
 	struct sockaddr_llc sllc;
 	struct sock *sk = sock->sk;
@@ -982,7 +982,6 @@ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr,
 	lock_sock(sk);
 	if (sock_flag(sk, SOCK_ZAPPED))
 		goto out;
-	*uaddrlen = sizeof(sllc);
 	if (peer) {
 		rc = -ENOTCONN;
 		if (sk->sk_state != TCP_ESTABLISHED)
@@ -1003,9 +1002,9 @@ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr,
 			       IFHWADDRLEN);
 		}
 	}
-	rc = 0;
 	sllc.sllc_family = AF_LLC;
 	memcpy(uaddr, &sllc, sizeof(sllc));
+	rc = sizeof(sllc);
 out:
 	release_sock(sk);
 	return rc;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 2ad445c1d27c..3c8af14330b5 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1105,7 +1105,7 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr,
 }
 
 static int netlink_getname(struct socket *sock, struct sockaddr *addr,
-			   int *addr_len, int peer)
+			   int peer)
 {
 	struct sock *sk = sock->sk;
 	struct netlink_sock *nlk = nlk_sk(sk);
@@ -1113,7 +1113,6 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr,
 
 	nladdr->nl_family = AF_NETLINK;
 	nladdr->nl_pad = 0;
-	*addr_len = sizeof(*nladdr);
 
 	if (peer) {
 		nladdr->nl_pid = nlk->dst_portid;
@@ -1124,7 +1123,7 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr,
 		nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
 		netlink_unlock_table();
 	}
-	return 0;
+	return sizeof(*nladdr);
 }
 
 static int netlink_ioctl(struct socket *sock, unsigned int cmd,
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 9ba30c63be3d..35bb6807927f 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -829,11 +829,12 @@ out_release:
 }
 
 static int nr_getname(struct socket *sock, struct sockaddr *uaddr,
-	int *uaddr_len, int peer)
+	int peer)
 {
 	struct full_sockaddr_ax25 *sax = (struct full_sockaddr_ax25 *)uaddr;
 	struct sock *sk = sock->sk;
 	struct nr_sock *nr = nr_sk(sk);
+	int uaddr_len;
 
 	memset(&sax->fsa_ax25, 0, sizeof(struct sockaddr_ax25));
 
@@ -848,16 +849,16 @@ static int nr_getname(struct socket *sock, struct sockaddr *uaddr,
 		sax->fsa_ax25.sax25_call   = nr->user_addr;
 		memset(sax->fsa_digipeater, 0, sizeof(sax->fsa_digipeater));
 		sax->fsa_digipeater[0]     = nr->dest_addr;
-		*uaddr_len = sizeof(struct full_sockaddr_ax25);
+		uaddr_len = sizeof(struct full_sockaddr_ax25);
 	} else {
 		sax->fsa_ax25.sax25_family = AF_NETROM;
 		sax->fsa_ax25.sax25_ndigis = 0;
 		sax->fsa_ax25.sax25_call   = nr->source_addr;
-		*uaddr_len = sizeof(struct sockaddr_ax25);
+		uaddr_len = sizeof(struct sockaddr_ax25);
 	}
 	release_sock(sk);
 
-	return 0;
+	return uaddr_len;
 }
 
 int nr_rx_frame(struct sk_buff *skb, struct net_device *dev)
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index 376040092142..ea0c0c6f1874 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -497,7 +497,7 @@ error:
 }
 
 static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr,
-			     int *len, int peer)
+			     int peer)
 {
 	struct sock *sk = sock->sk;
 	struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk);
@@ -510,7 +510,6 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr,
 		 llcp_sock->dsap, llcp_sock->ssap);
 
 	memset(llcp_addr, 0, sizeof(*llcp_addr));
-	*len = sizeof(struct sockaddr_nfc_llcp);
 
 	lock_sock(sk);
 	if (!llcp_sock->dev) {
@@ -528,7 +527,7 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr,
 	       llcp_addr->service_name_len);
 	release_sock(sk);
 
-	return 0;
+	return sizeof(struct sockaddr_nfc_llcp);
 }
 
 static inline __poll_t llcp_accept_poll(struct sock *parent)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e0f3f4aeeb4f..616cb9c18f88 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3409,7 +3409,7 @@ out:
 }
 
 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
-			       int *uaddr_len, int peer)
+			       int peer)
 {
 	struct net_device *dev;
 	struct sock *sk	= sock->sk;
@@ -3424,13 +3424,12 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
 	if (dev)
 		strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
 	rcu_read_unlock();
-	*uaddr_len = sizeof(*uaddr);
 
-	return 0;
+	return sizeof(*uaddr);
 }
 
 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
-			  int *uaddr_len, int peer)
+			  int peer)
 {
 	struct net_device *dev;
 	struct sock *sk = sock->sk;
@@ -3455,9 +3454,8 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
 		sll->sll_halen = 0;
 	}
 	rcu_read_unlock();
-	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
 
-	return 0;
+	return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
 }
 
 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index fffcd69f63ff..f9b40e6a18a5 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -326,7 +326,7 @@ static int pn_socket_accept(struct socket *sock, struct socket *newsock,
 }
 
 static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,
-				int *sockaddr_len, int peer)
+				int peer)
 {
 	struct sock *sk = sock->sk;
 	struct pn_sock *pn = pn_sk(sk);
@@ -337,8 +337,7 @@ static int pn_socket_getname(struct socket *sock, struct sockaddr *addr,
 		pn_sockaddr_set_object((struct sockaddr_pn *)addr,
 					pn->sobject);
 
-	*sockaddr_len = sizeof(struct sockaddr_pn);
-	return 0;
+	return sizeof(struct sockaddr_pn);
 }
 
 static __poll_t pn_socket_poll(struct file *file, struct socket *sock,
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 5fb3929e3d7d..b33e5aeb4c06 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -893,7 +893,7 @@ static int qrtr_connect(struct socket *sock, struct sockaddr *saddr,
 }
 
 static int qrtr_getname(struct socket *sock, struct sockaddr *saddr,
-			int *len, int peer)
+			int peer)
 {
 	struct qrtr_sock *ipc = qrtr_sk(sock->sk);
 	struct sockaddr_qrtr qaddr;
@@ -912,12 +912,11 @@ static int qrtr_getname(struct socket *sock, struct sockaddr *saddr,
 	}
 	release_sock(sk);
 
-	*len = sizeof(qaddr);
 	qaddr.sq_family = AF_QIPCRTR;
 
 	memcpy(saddr, &qaddr, sizeof(qaddr));
 
-	return 0;
+	return sizeof(qaddr);
 }
 
 static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 744c637c86b0..0a8eefd256b3 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -110,7 +110,7 @@ void rds_wake_sk_sleep(struct rds_sock *rs)
 }
 
 static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
-		       int *uaddr_len, int peer)
+		       int peer)
 {
 	struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
 	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
@@ -131,8 +131,7 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
 
 	sin->sin_family = AF_INET;
 
-	*uaddr_len = sizeof(*sin);
-	return 0;
+	return sizeof(*sin);
 }
 
 /*
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 44c4652721af..08230a145042 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -227,7 +227,6 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
 	struct rds_tcp_connection *tc;
 	unsigned long flags;
 	struct sockaddr_in sin;
-	int sinlen;
 	struct socket *sock;
 
 	spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
@@ -239,12 +238,10 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
 
 		sock = tc->t_sock;
 		if (sock) {
-			sock->ops->getname(sock, (struct sockaddr *)&sin,
-					   &sinlen, 0);
+			sock->ops->getname(sock, (struct sockaddr *)&sin, 0);
 			tsinfo.local_addr = sin.sin_addr.s_addr;
 			tsinfo.local_port = sin.sin_port;
-			sock->ops->getname(sock, (struct sockaddr *)&sin,
-					   &sinlen, 1);
+			sock->ops->getname(sock, (struct sockaddr *)&sin, 1);
 			tsinfo.peer_addr = sin.sin_addr.s_addr;
 			tsinfo.peer_port = sin.sin_port;
 		}
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 083bd251406f..5170373b797c 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -938,7 +938,7 @@ out_release:
 }
 
 static int rose_getname(struct socket *sock, struct sockaddr *uaddr,
-	int *uaddr_len, int peer)
+	int peer)
 {
 	struct full_sockaddr_rose *srose = (struct full_sockaddr_rose *)uaddr;
 	struct sock *sk = sock->sk;
@@ -964,8 +964,7 @@ static int rose_getname(struct socket *sock, struct sockaddr *uaddr,
 			srose->srose_digis[n] = rose->source_digis[n];
 	}
 
-	*uaddr_len = sizeof(struct full_sockaddr_rose);
-	return 0;
+	return sizeof(struct full_sockaddr_rose);
 }
 
 int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct rose_neigh *neigh, unsigned int lci)
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e35d4f73d2df..0d873c58e516 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -952,16 +952,16 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt,
 
 /* Handle SCTP_I_WANT_MAPPED_V4_ADDR for getpeername() and getsockname() */
 static int sctp_getname(struct socket *sock, struct sockaddr *uaddr,
-			int *uaddr_len, int peer)
+			int peer)
 {
 	int rc;
 
-	rc = inet6_getname(sock, uaddr, uaddr_len, peer);
+	rc = inet6_getname(sock, uaddr, peer);
 
-	if (rc != 0)
+	if (rc < 0)
 		return rc;
 
-	*uaddr_len = sctp_v6_addr_to_user(sctp_sk(sock->sk),
+	rc = sctp_v6_addr_to_user(sctp_sk(sock->sk),
 					  (union sctp_addr *)uaddr);
 
 	return rc;
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index da1a5cdefd13..38ae22b65e77 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -281,7 +281,6 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock,
 	struct in_device *in_dev;
 	struct sockaddr_in addr;
 	int rc = -ENOENT;
-	int len;
 
 	if (!dst) {
 		rc = -ENOTCONN;
@@ -293,7 +292,7 @@ int smc_netinfo_by_tcpsk(struct socket *clcsock,
 	}
 
 	/* get address to which the internal TCP socket is bound */
-	kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
+	kernel_getsockname(clcsock, (struct sockaddr *)&addr);
 	/* analyze IPv4 specific data of net_device belonging to TCP socket */
 	rcu_read_lock();
 	in_dev = __in_dev_get_rcu(dst->dev);
@@ -771,7 +770,7 @@ static void smc_listen_work(struct work_struct *work)
 	u8 buf[SMC_CLC_MAX_LEN];
 	struct smc_link *link;
 	int reason_code = 0;
-	int rc = 0, len;
+	int rc = 0;
 	__be32 subnet;
 	u8 prefix_len;
 	u8 ibport;
@@ -824,7 +823,7 @@ static void smc_listen_work(struct work_struct *work)
 	}
 
 	/* get address of the peer connected to the internal TCP socket */
-	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
+	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr);
 
 	/* allocate connection / link group */
 	mutex_lock(&smc_create_lgr_pending);
@@ -1075,7 +1074,7 @@ out:
 }
 
 static int smc_getname(struct socket *sock, struct sockaddr *addr,
-		       int *len, int peer)
+		       int peer)
 {
 	struct smc_sock *smc;
 
@@ -1085,7 +1084,7 @@ static int smc_getname(struct socket *sock, struct sockaddr *addr,
 
 	smc = smc_sk(sock->sk);
 
-	return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
+	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
 }
 
 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
diff --git a/net/socket.c b/net/socket.c
index a93c99b518ca..fac8246a8ae8 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1573,8 +1573,9 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
 		goto out_fd;
 
 	if (upeer_sockaddr) {
-		if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
-					  &len, 2) < 0) {
+		len = newsock->ops->getname(newsock,
+					(struct sockaddr *)&address, 2);
+		if (len < 0) {
 			err = -ECONNABORTED;
 			goto out_fd;
 		}
@@ -1654,7 +1655,7 @@ SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
 {
 	struct socket *sock;
 	struct sockaddr_storage address;
-	int len, err, fput_needed;
+	int err, fput_needed;
 
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
 	if (!sock)
@@ -1664,10 +1665,11 @@ SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr,
 	if (err)
 		goto out_put;
 
-	err = sock->ops->getname(sock, (struct sockaddr *)&address, &len, 0);
-	if (err)
+	err = sock->ops->getname(sock, (struct sockaddr *)&address, 0);
+	if (err < 0)
 		goto out_put;
-	err = move_addr_to_user(&address, len, usockaddr, usockaddr_len);
+        /* "err" is actually length in this case */
+	err = move_addr_to_user(&address, err, usockaddr, usockaddr_len);
 
 out_put:
 	fput_light(sock->file, fput_needed);
@@ -1685,7 +1687,7 @@ SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
 {
 	struct socket *sock;
 	struct sockaddr_storage address;
-	int len, err, fput_needed;
+	int err, fput_needed;
 
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
 	if (sock != NULL) {
@@ -1695,11 +1697,10 @@ SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr,
 			return err;
 		}
 
-		err =
-		    sock->ops->getname(sock, (struct sockaddr *)&address, &len,
-				       1);
-		if (!err)
-			err = move_addr_to_user(&address, len, usockaddr,
+		err = sock->ops->getname(sock, (struct sockaddr *)&address, 1);
+		if (err >= 0)
+			/* "err" is actually length in this case */
+			err = move_addr_to_user(&address, err, usockaddr,
 						usockaddr_len);
 		fput_light(sock->file, fput_needed);
 	}
@@ -3166,17 +3167,15 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen,
 }
 EXPORT_SYMBOL(kernel_connect);
 
-int kernel_getsockname(struct socket *sock, struct sockaddr *addr,
-			 int *addrlen)
+int kernel_getsockname(struct socket *sock, struct sockaddr *addr)
 {
-	return sock->ops->getname(sock, addr, addrlen, 0);
+	return sock->ops->getname(sock, addr, 0);
 }
 EXPORT_SYMBOL(kernel_getsockname);
 
-int kernel_getpeername(struct socket *sock, struct sockaddr *addr,
-			 int *addrlen)
+int kernel_getpeername(struct socket *sock, struct sockaddr *addr)
 {
-	return sock->ops->getname(sock, addr, addrlen, 1);
+	return sock->ops->getname(sock, addr, 1);
 }
 EXPORT_SYMBOL(kernel_getpeername);
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 6e432ecd7f99..806395687bb6 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1231,7 +1231,7 @@ static const struct sockaddr_in6 rpc_in6addr_loopback = {
  * negative errno is returned.
  */
 static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen,
-			struct sockaddr *buf, int buflen)
+			struct sockaddr *buf)
 {
 	struct socket *sock;
 	int err;
@@ -1269,7 +1269,7 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen,
 		goto out_release;
 	}
 
-	err = kernel_getsockname(sock, buf, &buflen);
+	err = kernel_getsockname(sock, buf);
 	if (err < 0) {
 		dprintk("RPC:       getsockname failed (%d)\n", err);
 		goto out_release;
@@ -1353,7 +1353,7 @@ int rpc_localaddr(struct rpc_clnt *clnt, struct sockaddr *buf, size_t buflen)
 	rcu_read_unlock();
 
 	rpc_set_port(sap, 0);
-	err = rpc_sockname(net, sap, salen, buf, buflen);
+	err = rpc_sockname(net, sap, salen, buf);
 	put_net(net);
 	if (err != 0)
 		/* Couldn't discover local address, return ANYADDR */
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 943f2a745cd5..08cd951aaeea 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -832,12 +832,13 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
 	}
 	set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
 
-	err = kernel_getpeername(newsock, sin, &slen);
+	err = kernel_getpeername(newsock, sin);
 	if (err < 0) {
 		net_warn_ratelimited("%s: peername failed (err %d)!\n",
 				     serv->sv_name, -err);
 		goto failed;		/* aborted connection or whatever */
 	}
+	slen = err;
 
 	/* Ideally, we would want to reject connections from unauthorized
 	 * hosts here, but when we get encryption, the IP of the host won't
@@ -866,7 +867,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
 	if (IS_ERR(newsvsk))
 		goto failed;
 	svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
-	err = kernel_getsockname(newsock, sin, &slen);
+	err = kernel_getsockname(newsock, sin);
+	slen = err;
 	if (unlikely(err < 0)) {
 		dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
 		slen = offsetof(struct sockaddr, sa_data);
@@ -1465,7 +1467,8 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return,
 		err = PTR_ERR(svsk);
 		goto out;
 	}
-	if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0)
+	salen = kernel_getsockname(svsk->sk_sock, sin);
+	if (salen >= 0)
 		svc_xprt_set_local(&svsk->sk_xprt, sin, salen);
 	svc_add_new_perm_xprt(serv, &svsk->sk_xprt);
 	return svc_one_sock_name(svsk, name_return, len);
@@ -1539,10 +1542,10 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
 	if (error < 0)
 		goto bummer;
 
-	newlen = len;
-	error = kernel_getsockname(sock, newsin, &newlen);
+	error = kernel_getsockname(sock, newsin);
 	if (error < 0)
 		goto bummer;
+	newlen = error;
 
 	if (protocol == IPPROTO_TCP) {
 		if ((error = kernel_listen(sock, 64)) < 0)
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index a6b8c1f8f92a..956e29c1438d 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1794,10 +1794,9 @@ static void xs_sock_set_reuseport(struct socket *sock)
 static unsigned short xs_sock_getport(struct socket *sock)
 {
 	struct sockaddr_storage buf;
-	int buflen;
 	unsigned short port = 0;
 
-	if (kernel_getsockname(sock, (struct sockaddr *)&buf, &buflen) < 0)
+	if (kernel_getsockname(sock, (struct sockaddr *)&buf) < 0)
 		goto out;
 	switch (buf.ss_family) {
 	case AF_INET6:
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index b0323ec7971e..f93477187a90 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -665,7 +665,7 @@ exit:
  *       a completely predictable manner).
  */
 static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
-			int *uaddr_len, int peer)
+			int peer)
 {
 	struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr;
 	struct sock *sk = sock->sk;
@@ -684,13 +684,12 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
 		addr->addr.id.node = tn->own_addr;
 	}
 
-	*uaddr_len = sizeof(*addr);
 	addr->addrtype = TIPC_ADDR_ID;
 	addr->family = AF_TIPC;
 	addr->scope = 0;
 	addr->addr.name.domain = 0;
 
-	return 0;
+	return sizeof(*addr);
 }
 
 /**
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d545e1d0dea2..723698416242 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -637,7 +637,7 @@ static int unix_stream_connect(struct socket *, struct sockaddr *,
 			       int addr_len, int flags);
 static int unix_socketpair(struct socket *, struct socket *);
 static int unix_accept(struct socket *, struct socket *, int, bool);
-static int unix_getname(struct socket *, struct sockaddr *, int *, int);
+static int unix_getname(struct socket *, struct sockaddr *, int);
 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
 static __poll_t unix_dgram_poll(struct file *, struct socket *,
 				    poll_table *);
@@ -1453,7 +1453,7 @@ out:
 }
 
 
-static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
+static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
 {
 	struct sock *sk = sock->sk;
 	struct unix_sock *u;
@@ -1476,12 +1476,12 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_
 	if (!u->addr) {
 		sunaddr->sun_family = AF_UNIX;
 		sunaddr->sun_path[0] = 0;
-		*uaddr_len = sizeof(short);
+		err = sizeof(short);
 	} else {
 		struct unix_address *addr = u->addr;
 
-		*uaddr_len = addr->len;
-		memcpy(sunaddr, addr->name, *uaddr_len);
+		err = addr->len;
+		memcpy(sunaddr, addr->name, addr->len);
 	}
 	unix_state_unlock(sk);
 	sock_put(sk);
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index e0fc84daed94..aac9b8f6552e 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -759,7 +759,7 @@ vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 }
 
 static int vsock_getname(struct socket *sock,
-			 struct sockaddr *addr, int *addr_len, int peer)
+			 struct sockaddr *addr, int peer)
 {
 	int err;
 	struct sock *sk;
@@ -794,7 +794,7 @@ static int vsock_getname(struct socket *sock,
 	 */
 	BUILD_BUG_ON(sizeof(*vm_addr) > 128);
 	memcpy(addr, vm_addr, sizeof(*vm_addr));
-	*addr_len = sizeof(*vm_addr);
+	err = sizeof(*vm_addr);
 
 out:
 	release_sock(sk);
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 562cc11131f6..d49aa79b7997 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -896,7 +896,7 @@ out:
 }
 
 static int x25_getname(struct socket *sock, struct sockaddr *uaddr,
-		       int *uaddr_len, int peer)
+		       int peer)
 {
 	struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr;
 	struct sock *sk = sock->sk;
@@ -913,7 +913,7 @@ static int x25_getname(struct socket *sock, struct sockaddr *uaddr,
 		sx25->sx25_addr = x25->source_addr;
 
 	sx25->sx25_family = AF_X25;
-	*uaddr_len = sizeof(*sx25);
+	rc = sizeof(*sx25);
 
 out:
 	return rc;
-- 
cgit v1.2.3


From 60aa80460da115216070082b4ab686b9e37c86ef Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 30 Jan 2018 14:53:48 +0000
Subject: esp4: remove redundant initialization of pointer esph

Pointer esph is being assigned a value that is never read, esph is
re-assigned and only read inside an if statement, hence the
initialization is redundant and can be removed.

Cleans up clang warning:
net/ipv4/esp4.c:657:21: warning: Value stored to 'esph' during
its initialization is never read

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/esp4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 296d0b956bfe..97689012b357 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -654,7 +654,7 @@ static void esp_input_restore_header(struct sk_buff *skb)
 static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
 {
 	struct xfrm_state *x = xfrm_input_state(skb);
-	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
+	struct ip_esp_hdr *esph;
 
 	/* For ESN we move the header forward by 4 bytes to
 	 * accomodate the high bits.  We will move it back after
-- 
cgit v1.2.3


From 98f6c533a3e98f21305575f0cf87cdb6c2210c43 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:26:02 +0300
Subject: net: Assign net to net_namespace_list in setup_net()

This patch merges two repeating pieces of code in one,
and they will live in setup_net() now.

The only change is that assignment:

	init_net_initialized = true;

becomes reordered with:

	list_add_tail_rcu(&net->list, &net_namespace_list);

The order does not have visible effect, and it is a simple
cleanup because of:

init_net_initialized is used in !CONFIG_NET_NS case
to order proc_net_ns_ops registration occuring at boot time:

	start_kernel()->proc_root_init()->proc_net_init(),
with
	net_ns_init()->setup_net(&init_net, &init_user_ns)

also occuring in boot time from the same init_task.

When there are no another tasks to race with them,
for the single task it does not matter, which order
two sequential independent loads should be made.
So we make them reordered.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 3cad5f51afd3..1180c217895a 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -303,6 +303,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 		if (error < 0)
 			goto out_undo;
 	}
+	rtnl_lock();
+	list_add_tail_rcu(&net->list, &net_namespace_list);
+	rtnl_unlock();
 out:
 	return error;
 
@@ -424,11 +427,6 @@ struct net *copy_net_ns(unsigned long flags,
 
 	net->ucounts = ucounts;
 	rv = setup_net(net, user_ns);
-	if (rv == 0) {
-		rtnl_lock();
-		list_add_tail_rcu(&net->list, &net_namespace_list);
-		rtnl_unlock();
-	}
 	mutex_unlock(&net_mutex);
 	if (rv < 0) {
 		dec_net_namespaces(ucounts);
@@ -880,11 +878,6 @@ static int __init net_ns_init(void)
 		panic("Could not setup the initial network namespace");
 
 	init_net_initialized = true;
-
-	rtnl_lock();
-	list_add_tail_rcu(&init_net.list, &net_namespace_list);
-	rtnl_unlock();
-
 	mutex_unlock(&net_mutex);
 
 	register_pernet_subsys(&net_ns_ops);
-- 
cgit v1.2.3


From 5ba049a5cc8e24a1643df75bbf65b4efa070fa74 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:26:13 +0300
Subject: net: Cleanup in copy_net_ns()

Line up destructors actions in the revers order
to constructors. Next patches will add more actions,
and this will be comfortable, if there is the such
order.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 1180c217895a..81384386f91b 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -411,27 +411,25 @@ struct net *copy_net_ns(unsigned long flags,
 
 	net = net_alloc();
 	if (!net) {
-		dec_net_namespaces(ucounts);
-		return ERR_PTR(-ENOMEM);
+		rv = -ENOMEM;
+		goto dec_ucounts;
 	}
-
+	refcount_set(&net->passive, 1);
+	net->ucounts = ucounts;
 	get_user_ns(user_ns);
 
 	rv = mutex_lock_killable(&net_mutex);
-	if (rv < 0) {
-		net_free(net);
-		dec_net_namespaces(ucounts);
-		put_user_ns(user_ns);
-		return ERR_PTR(rv);
-	}
+	if (rv < 0)
+		goto put_userns;
 
-	net->ucounts = ucounts;
 	rv = setup_net(net, user_ns);
 	mutex_unlock(&net_mutex);
 	if (rv < 0) {
-		dec_net_namespaces(ucounts);
+put_userns:
 		put_user_ns(user_ns);
 		net_drop_ns(net);
+dec_ucounts:
+		dec_net_namespaces(ucounts);
 		return ERR_PTR(rv);
 	}
 	return net;
-- 
cgit v1.2.3


From 1a57feb847c56d6193f67d0e892c24e71f9e3ab1 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:26:23 +0300
Subject: net: Introduce net_sem for protection of pernet_list

Currently, the mutex is mostly used to protect pernet operations
list. It orders setup_net() and cleanup_net() with parallel
{un,}register_pernet_operations() calls, so ->exit{,batch} methods
of the same pernet operations are executed for a dying net, as
were used to call ->init methods, even after the net namespace
is unlinked from net_namespace_list in cleanup_net().

But there are several problems with scalability. The first one
is that more than one net can't be created or destroyed
at the same moment on the node. For big machines with many cpus
running many containers it's very sensitive.

The second one is that it's need to synchronize_rcu() after net
is removed from net_namespace_list():

Destroy net_ns:
cleanup_net()
  mutex_lock(&net_mutex)
  list_del_rcu(&net->list)
  synchronize_rcu()                                  <--- Sleep there for ages
  list_for_each_entry_reverse(ops, &pernet_list, list)
    ops_exit_list(ops, &net_exit_list)
  list_for_each_entry_reverse(ops, &pernet_list, list)
    ops_free_list(ops, &net_exit_list)
  mutex_unlock(&net_mutex)

This primitive is not fast, especially on the systems with many processors
and/or when preemptible RCU is enabled in config. So, all the time, while
cleanup_net() is waiting for RCU grace period, creation of new net namespaces
is not possible, the tasks, who makes it, are sleeping on the same mutex:

Create net_ns:
copy_net_ns()
  mutex_lock_killable(&net_mutex)                    <--- Sleep there for ages

I observed 20-30 seconds hangs of "unshare -n" on ordinary 8-cpu laptop
with preemptible RCU enabled after CRIU tests round is finished.

The solution is to convert net_mutex to the rw_semaphore and add fine grain
locks to really small number of pernet_operations, what really need them.

Then, pernet_operations::init/::exit methods, modifying the net-related data,
will require down_read() locking only, while down_write() will be used
for changing pernet_list (i.e., when modules are being loaded and unloaded).

This gives signify performance increase, after all patch set is applied,
like you may see here:

%for i in {1..10000}; do unshare -n bash -c exit; done

*before*
real 1m40,377s
user 0m9,672s
sys 0m19,928s

*after*
real 0m17,007s
user 0m5,311s
sys 0m11,779

(5.8 times faster)

This patch starts replacing net_mutex to net_sem. It adds rw_semaphore,
describes the variables it protects, and makes to use, where appropriate.
net_mutex is still present, and next patches will kick it out step-by-step.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 39 ++++++++++++++++++++++++++-------------
 net/core/rtnetlink.c     |  4 ++--
 2 files changed, 28 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 81384386f91b..e89b2b7abd36 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -41,6 +41,11 @@ struct net init_net = {
 EXPORT_SYMBOL(init_net);
 
 static bool init_net_initialized;
+/*
+ * net_sem: protects: pernet_list, net_generic_ids,
+ * init_net_initialized and first_device pointer.
+ */
+DECLARE_RWSEM(net_sem);
 
 #define MIN_PERNET_OPS_ID	\
 	((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
@@ -286,7 +291,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
  */
 static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 {
-	/* Must be called with net_mutex held */
+	/* Must be called with net_sem held */
 	const struct pernet_operations *ops, *saved_ops;
 	int error = 0;
 	LIST_HEAD(net_exit_list);
@@ -418,12 +423,16 @@ struct net *copy_net_ns(unsigned long flags,
 	net->ucounts = ucounts;
 	get_user_ns(user_ns);
 
-	rv = mutex_lock_killable(&net_mutex);
+	rv = down_read_killable(&net_sem);
 	if (rv < 0)
 		goto put_userns;
-
+	rv = mutex_lock_killable(&net_mutex);
+	if (rv < 0)
+		goto up_read;
 	rv = setup_net(net, user_ns);
 	mutex_unlock(&net_mutex);
+up_read:
+	up_read(&net_sem);
 	if (rv < 0) {
 put_userns:
 		put_user_ns(user_ns);
@@ -477,6 +486,7 @@ static void cleanup_net(struct work_struct *work)
 	list_replace_init(&cleanup_list, &net_kill_list);
 	spin_unlock_irq(&cleanup_list_lock);
 
+	down_read(&net_sem);
 	mutex_lock(&net_mutex);
 
 	/* Don't let anyone else find us. */
@@ -517,6 +527,7 @@ static void cleanup_net(struct work_struct *work)
 		ops_free_list(ops, &net_exit_list);
 
 	mutex_unlock(&net_mutex);
+	up_read(&net_sem);
 
 	/* Ensure there are no outstanding rcu callbacks using this
 	 * network namespace.
@@ -543,8 +554,10 @@ static void cleanup_net(struct work_struct *work)
  */
 void net_ns_barrier(void)
 {
+	down_write(&net_sem);
 	mutex_lock(&net_mutex);
 	mutex_unlock(&net_mutex);
+	up_write(&net_sem);
 }
 EXPORT_SYMBOL(net_ns_barrier);
 
@@ -871,12 +884,12 @@ static int __init net_ns_init(void)
 
 	rcu_assign_pointer(init_net.gen, ng);
 
-	mutex_lock(&net_mutex);
+	down_write(&net_sem);
 	if (setup_net(&init_net, &init_user_ns))
 		panic("Could not setup the initial network namespace");
 
 	init_net_initialized = true;
-	mutex_unlock(&net_mutex);
+	up_write(&net_sem);
 
 	register_pernet_subsys(&net_ns_ops);
 
@@ -1016,9 +1029,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
 int register_pernet_subsys(struct pernet_operations *ops)
 {
 	int error;
-	mutex_lock(&net_mutex);
+	down_write(&net_sem);
 	error =  register_pernet_operations(first_device, ops);
-	mutex_unlock(&net_mutex);
+	up_write(&net_sem);
 	return error;
 }
 EXPORT_SYMBOL_GPL(register_pernet_subsys);
@@ -1034,9 +1047,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys);
  */
 void unregister_pernet_subsys(struct pernet_operations *ops)
 {
-	mutex_lock(&net_mutex);
+	down_write(&net_sem);
 	unregister_pernet_operations(ops);
-	mutex_unlock(&net_mutex);
+	up_write(&net_sem);
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
 
@@ -1062,11 +1075,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
 int register_pernet_device(struct pernet_operations *ops)
 {
 	int error;
-	mutex_lock(&net_mutex);
+	down_write(&net_sem);
 	error = register_pernet_operations(&pernet_list, ops);
 	if (!error && (first_device == &pernet_list))
 		first_device = &ops->list;
-	mutex_unlock(&net_mutex);
+	up_write(&net_sem);
 	return error;
 }
 EXPORT_SYMBOL_GPL(register_pernet_device);
@@ -1082,11 +1095,11 @@ EXPORT_SYMBOL_GPL(register_pernet_device);
  */
 void unregister_pernet_device(struct pernet_operations *ops)
 {
-	mutex_lock(&net_mutex);
+	down_write(&net_sem);
 	if (&ops->list == first_device)
 		first_device = first_device->next;
 	unregister_pernet_operations(ops);
-	mutex_unlock(&net_mutex);
+	up_write(&net_sem);
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_device);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bc290413a49d..257e7bbaffba 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -454,11 +454,11 @@ static void rtnl_lock_unregistering_all(void)
 void rtnl_link_unregister(struct rtnl_link_ops *ops)
 {
 	/* Close the race with cleanup_net() */
-	mutex_lock(&net_mutex);
+	down_write(&net_sem);
 	rtnl_lock_unregistering_all();
 	__rtnl_link_unregister(ops);
 	rtnl_unlock();
-	mutex_unlock(&net_mutex);
+	up_write(&net_sem);
 }
 EXPORT_SYMBOL_GPL(rtnl_link_unregister);
 
-- 
cgit v1.2.3


From bcab1ddd9b2b105390712a9c1605bdb20a7f9a03 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:26:33 +0300
Subject: net: Move mutex_unlock() in cleanup_net() up

net_sem protects from pernet_list changing, while
ops_free_list() makes simple kfree(), and it can't
race with other pernet_operations callbacks.

So we may release net_mutex earlier then it was.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index e89b2b7abd36..f8453c438798 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -522,11 +522,12 @@ static void cleanup_net(struct work_struct *work)
 	list_for_each_entry_reverse(ops, &pernet_list, list)
 		ops_exit_list(ops, &net_exit_list);
 
+	mutex_unlock(&net_mutex);
+
 	/* Free the net generic variables */
 	list_for_each_entry_reverse(ops, &pernet_list, list)
 		ops_free_list(ops, &net_exit_list);
 
-	mutex_unlock(&net_mutex);
 	up_read(&net_sem);
 
 	/* Ensure there are no outstanding rcu callbacks using this
-- 
cgit v1.2.3


From 447cd7a0d7d1e5b4486e99cce289654fec9951e3 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:26:44 +0300
Subject: net: Allow pernet_operations to be executed in parallel

This adds new pernet_operations::async flag to indicate operations,
which ->init(), ->exit() and ->exit_batch() methods are allowed
to be executed in parallel with the methods of any other pernet_operations.

When there are only asynchronous pernet_operations in the system,
net_mutex won't be taken for a net construction and destruction.

Also, remove BUG_ON(mutex_is_locked()) from net_assign_generic()
without replacing with the equivalent net_sem check, as there is
one more lockdep assert below.

v3: Add comment near net_mutex.

Suggested-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index f8453c438798..2a01ff32d9c7 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -29,6 +29,7 @@
 
 static LIST_HEAD(pernet_list);
 static struct list_head *first_device = &pernet_list;
+/* Used only if there are !async pernet_operations registered */
 DEFINE_MUTEX(net_mutex);
 
 LIST_HEAD(net_namespace_list);
@@ -41,8 +42,9 @@ struct net init_net = {
 EXPORT_SYMBOL(init_net);
 
 static bool init_net_initialized;
+static unsigned nr_sync_pernet_ops;
 /*
- * net_sem: protects: pernet_list, net_generic_ids,
+ * net_sem: protects: pernet_list, net_generic_ids, nr_sync_pernet_ops,
  * init_net_initialized and first_device pointer.
  */
 DECLARE_RWSEM(net_sem);
@@ -70,11 +72,10 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
 {
 	struct net_generic *ng, *old_ng;
 
-	BUG_ON(!mutex_is_locked(&net_mutex));
 	BUG_ON(id < MIN_PERNET_OPS_ID);
 
 	old_ng = rcu_dereference_protected(net->gen,
-					   lockdep_is_held(&net_mutex));
+					   lockdep_is_held(&net_sem));
 	if (old_ng->s.len > id) {
 		old_ng->ptr[id] = data;
 		return 0;
@@ -426,11 +427,14 @@ struct net *copy_net_ns(unsigned long flags,
 	rv = down_read_killable(&net_sem);
 	if (rv < 0)
 		goto put_userns;
-	rv = mutex_lock_killable(&net_mutex);
-	if (rv < 0)
-		goto up_read;
+	if (nr_sync_pernet_ops) {
+		rv = mutex_lock_killable(&net_mutex);
+		if (rv < 0)
+			goto up_read;
+	}
 	rv = setup_net(net, user_ns);
-	mutex_unlock(&net_mutex);
+	if (nr_sync_pernet_ops)
+		mutex_unlock(&net_mutex);
 up_read:
 	up_read(&net_sem);
 	if (rv < 0) {
@@ -487,7 +491,8 @@ static void cleanup_net(struct work_struct *work)
 	spin_unlock_irq(&cleanup_list_lock);
 
 	down_read(&net_sem);
-	mutex_lock(&net_mutex);
+	if (nr_sync_pernet_ops)
+		mutex_lock(&net_mutex);
 
 	/* Don't let anyone else find us. */
 	rtnl_lock();
@@ -522,7 +527,8 @@ static void cleanup_net(struct work_struct *work)
 	list_for_each_entry_reverse(ops, &pernet_list, list)
 		ops_exit_list(ops, &net_exit_list);
 
-	mutex_unlock(&net_mutex);
+	if (nr_sync_pernet_ops)
+		mutex_unlock(&net_mutex);
 
 	/* Free the net generic variables */
 	list_for_each_entry_reverse(ops, &pernet_list, list)
@@ -994,6 +1000,9 @@ again:
 		rcu_barrier();
 		if (ops->id)
 			ida_remove(&net_generic_ids, *ops->id);
+	} else if (!ops->async) {
+		pr_info_once("Pernet operations %ps are sync.\n", ops);
+		nr_sync_pernet_ops++;
 	}
 
 	return error;
@@ -1001,7 +1010,8 @@ again:
 
 static void unregister_pernet_operations(struct pernet_operations *ops)
 {
-	
+	if (!ops->async)
+		BUG_ON(nr_sync_pernet_ops-- == 0);
 	__unregister_pernet_operations(ops);
 	rcu_barrier();
 	if (ops->id)
-- 
cgit v1.2.3


From 3fc3b827f0c4397c74d4b8a8a06d71b903a4982f Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:27:03 +0300
Subject: net: Convert net_ns_ops methods

This patch starts to convert pernet_subsys, registered
from pure initcalls.

net_ns_ops::net_ns_net_init/net_ns_net_init, methods use only
ida_simple_* functions, which are not need a synchronization.
They are synchronized by idr subsystem.

So, net_ns_ops methods are able to be executed
in parallel with methods of other pernet operations.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2a01ff32d9c7..e21c564c8c00 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -649,6 +649,7 @@ static __net_exit void net_ns_net_exit(struct net *net)
 static struct pernet_operations __net_initdata net_ns_ops = {
 	.init = net_ns_net_init,
 	.exit = net_ns_net_exit,
+	.async = true,
 };
 
 static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
-- 
cgit v1.2.3


From 93d230fe0762bf129fb4debc944af3e1c1e8f40e Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:27:13 +0300
Subject: net: Convert sysctl_pernet_ops

This patch starts to convert pernet_subsys, registered
from core initcalls.

Methods sysctl_net_init() and sysctl_net_exit() initialize
net::sysctls table of a namespace.

pernet_operations::init()/exit() methods from the rest
of the list do not touch net::sysctls of strangers,
so it's safe to execute sysctl_pernet_ops's methods
in parallel with any other pernet_operations.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sysctl_net.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index 9aed6fe1bf1a..f424539829b7 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -89,6 +89,7 @@ static void __net_exit sysctl_net_exit(struct net *net)
 static struct pernet_operations sysctl_pernet_ops = {
 	.init = sysctl_net_init,
 	.exit = sysctl_net_exit,
+	.async = true,
 };
 
 static struct ctl_table_header *net_header;
-- 
cgit v1.2.3


From 9549929923738d14dc450f0427a4b98ac3e8aff2 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:27:23 +0300
Subject: net: Convert netfilter_net_ops

Methods netfilter_net_init() and netfilter_net_exit()
initialize net::nf::hooks and change net-related proc
directory of net. Another pernet_operations are not
interested in forein net::nf::hooks or proc entries,
so it's safe to make them executed in parallel with
methods of other pernet operations.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 0f6b8172fb9a..d72cc786c7b7 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -629,6 +629,7 @@ static void __net_exit netfilter_net_exit(struct net *net)
 static struct pernet_operations netfilter_net_ops = {
 	.init = netfilter_net_init,
 	.exit = netfilter_net_exit,
+	.async = true,
 };
 
 int __init netfilter_init(void)
-- 
cgit v1.2.3


From c9d8fb91351fe1514731b7f6d24d4decc0028978 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:27:31 +0300
Subject: net: Convert nf_log_net_ops

The pernet_operations would have had a problem in parallel
execution with others, if init_net had been able to released.
But it's not, and the rest is safe for that.
There is memory allocation, which nobody else interested in,
and sysctl registration. So, we make them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_log.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index c2c1b16b7538..1ba3da51050d 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -577,6 +577,7 @@ static void __net_exit nf_log_net_exit(struct net *net)
 static struct pernet_operations nf_log_net_ops = {
 	.init = nf_log_net_init,
 	.exit = nf_log_net_exit,
+	.async = true,
 };
 
 int __init netfilter_log_init(void)
-- 
cgit v1.2.3


From 604da74e4fc1c2afc50dc7a1850acfd9aff2b58d Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:27:41 +0300
Subject: net: Convert net_inuse_ops

net_inuse_ops methods expose statistics in /proc.
No one from the rest of pernet_subsys or pernet_device
lists touch net::core::inuse.

So, it's safe to make net_inuse_ops async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 04e5e27c9b81..f2bf69b86c58 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3112,6 +3112,7 @@ static void __net_exit sock_inuse_exit_net(struct net *net)
 static struct pernet_operations net_inuse_ops = {
 	.init = sock_inuse_init_net,
 	.exit = sock_inuse_exit_net,
+	.async = true,
 };
 
 static __init int net_inuse_init(void)
-- 
cgit v1.2.3


From ff291d005a988aaa7d73daf44c3d04585e9f0637 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:27:51 +0300
Subject: net: Convert net_defaults_ops

net_defaults_ops introduce only net_defaults_init_net method,
and it acts on net::core::sysctl_somaxconn, which
is not interesting for the rest of pernet_subsys and
pernet_device lists. Then, make them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index e21c564c8c00..bcab9a938d6f 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -340,6 +340,7 @@ static int __net_init net_defaults_init_net(struct net *net)
 
 static struct pernet_operations net_defaults_ops = {
 	.init = net_defaults_init_net,
+	.async = true,
 };
 
 static __init int net_defaults_init(void)
-- 
cgit v1.2.3


From 194b95d2166661f890dca5ef3b952c73622eb4e5 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:28:00 +0300
Subject: net: Convert netlink_net_ops

The methods of netlink_net_ops create and destroy "netlink"
file, which are not interesting for foreigh pernet_operations.
So, netlink_net_ops may safely be made async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 3c8af14330b5..b3065908e146 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2723,6 +2723,7 @@ static void __init netlink_add_usersock_entry(void)
 static struct pernet_operations __net_initdata netlink_net_ops = {
 	.init = netlink_net_init,
 	.exit = netlink_net_exit,
+	.async = true,
 };
 
 static inline u32 netlink_hash(const void *data, u32 len, u32 seed)
-- 
cgit v1.2.3


From 46456675ec1b7f93dadd2b1b4b58d763c3ae9266 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:28:15 +0300
Subject: net: Convert rtnetlink_net_ops

rtnetlink_net_init() and rtnetlink_net_exit()
create and destroy netlink socket net::rtnl.

The socket is used to send rtnl notification via
rtnl_net_notifyid(). There is no a problem
to create and destroy it in parallel with other
pernet operations, as we link net in setup_net()
after the socket is created, and destroy
in cleanup_net() after net is unhashed from all
the lists and there is no RCU references on it.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 257e7bbaffba..67f375cfb982 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4724,6 +4724,7 @@ static void __net_exit rtnetlink_net_exit(struct net *net)
 static struct pernet_operations rtnetlink_net_ops = {
 	.init = rtnetlink_net_init,
 	.exit = rtnetlink_net_exit,
+	.async = true,
 };
 
 void __init rtnetlink_init(void)
-- 
cgit v1.2.3


From 36b0068e6c9892aa4757d4fa08fd14fbba72b3b3 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:28:44 +0300
Subject: net: Convert proto_net_ops

This patch starts to convert pernet_subsys, registered
from subsys initcalls.

It seems safe to be executed in parallel with others,
as it's only creates/destoyes proc entry,
which nobody else is not interested in.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index f2bf69b86c58..e90d461748f0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3386,6 +3386,7 @@ static __net_exit void proto_exit_net(struct net *net)
 static __net_initdata struct pernet_operations proto_net_ops = {
 	.init = proto_init_net,
 	.exit = proto_exit_net,
+	.async = true,
 };
 
 static int __init proto_init(void)
-- 
cgit v1.2.3


From 88b8ffebdb4d0f4da4e9a8383c8478c32372b42b Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:28:54 +0300
Subject: net: Convert pernet_subsys ops, registered via net_dev_init()

There are:
1)dev_proc_ops and dev_mc_net_ops, which create and destroy
pernet proc file and not interesting for another net namespaces;
2)netdev_net_ops, which creates pernet hashes, which are not
touched by another pernet_operations.

So, make them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c        | 1 +
 net/core/net-procfs.c | 2 ++
 2 files changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index dda9d7b9a840..dc7506f00a66 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8833,6 +8833,7 @@ static void __net_exit netdev_exit(struct net *net)
 static struct pernet_operations __net_initdata netdev_net_ops = {
 	.init = netdev_init,
 	.exit = netdev_exit,
+	.async = true,
 };
 
 static void __net_exit default_device_exit(struct net *net)
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index e010bb800d7b..65b51e778782 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -349,6 +349,7 @@ static void __net_exit dev_proc_net_exit(struct net *net)
 static struct pernet_operations __net_initdata dev_proc_ops = {
 	.init = dev_proc_net_init,
 	.exit = dev_proc_net_exit,
+	.async = true,
 };
 
 static int dev_mc_seq_show(struct seq_file *seq, void *v)
@@ -405,6 +406,7 @@ static void __net_exit dev_mc_net_exit(struct net *net)
 static struct pernet_operations __net_initdata dev_mc_net_ops = {
 	.init = dev_mc_net_init,
 	.exit = dev_mc_net_exit,
+	.async = true,
 };
 
 int __init dev_proc_init(void)
-- 
cgit v1.2.3


From 86b63418fd382b095fdac4408fd565aa5da4b036 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:29:03 +0300
Subject: net: Convert fib_* pernet_operations, registered via subsys_initcall

Both of them create and initialize lists, which are not touched
by another foreing pernet_operations.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_notifier.c | 1 +
 net/core/fib_rules.c    | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
index 0c048bdeb016..5ace0705a3f9 100644
--- a/net/core/fib_notifier.c
+++ b/net/core/fib_notifier.c
@@ -171,6 +171,7 @@ static void __net_exit fib_notifier_net_exit(struct net *net)
 static struct pernet_operations fib_notifier_net_ops = {
 	.init = fib_notifier_net_init,
 	.exit = fib_notifier_net_exit,
+	.async = true,
 };
 
 static int __init fib_notifier_init(void)
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 98e1066c3d55..cb071b8e8d17 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -1030,6 +1030,7 @@ static void __net_exit fib_rules_net_exit(struct net *net)
 static struct pernet_operations fib_rules_net_ops = {
 	.init = fib_rules_net_init,
 	.exit = fib_rules_net_exit,
+	.async = true,
 };
 
 static int __init fib_rules_init(void)
-- 
cgit v1.2.3


From 13da199c38ee7f33a1c42db62647118f9f9f527c Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:29:13 +0300
Subject: net: Convert subsys_initcall() registered pernet_operations from
 net/sched

psched_net_ops only creates and destroyes /proc entry,
and safe to be executed in parallel with any foreigh
pernet_operations.

tcf_action_net_ops initializes and destructs tcf_action_net::egdev_ht,
which is not touched by foreign pernet_operations.

So, make them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 1 +
 net/sched/sch_api.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index eba6682727dd..4886ea4a7d6e 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1454,6 +1454,7 @@ static struct pernet_operations tcf_action_net_ops = {
 	.exit = tcf_action_net_exit,
 	.id = &tcf_action_net_id,
 	.size = sizeof(struct tcf_action_net),
+	.async = true,
 };
 
 static int __init tc_action_init(void)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d512f49ee83c..27e672c12492 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -2128,6 +2128,7 @@ static void __net_exit psched_net_exit(struct net *net)
 static struct pernet_operations psched_net_ops = {
 	.init = psched_net_init,
 	.exit = psched_net_exit,
+	.async = true,
 };
 
 static int __init pktsched_init(void)
-- 
cgit v1.2.3


From 83caf62c867bed2637d4f3713839b0c414d6d966 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:29:23 +0300
Subject: net: Convert genl_pernet_ops

This pernet_operations create and destroy net::genl_sock.
Foreign pernet_operations don't touch it.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/genetlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 6f02499ef007..a6f63a5faee7 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -1035,6 +1035,7 @@ static void __net_exit genl_pernet_exit(struct net *net)
 static struct pernet_operations genl_pernet_ops = {
 	.init = genl_pernet_init,
 	.exit = genl_pernet_exit,
+	.async = true,
 };
 
 static int __init genl_init(void)
-- 
cgit v1.2.3


From 6c0075d0f6ccf5be4527408830852106808ab2bb Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:29:33 +0300
Subject: net: Convert wext_pernet_ops

These pernet_operations initialize and purge net::wext_nlevents
queue, and are not touched by foreign pernet_operations.

Mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/wireless/wext-core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 9efbfc753347..bc7064486b15 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -390,6 +390,7 @@ static void __net_exit wext_pernet_exit(struct net *net)
 static struct pernet_operations wext_pernet_ops = {
 	.init = wext_pernet_init,
 	.exit = wext_pernet_exit,
+	.async = true,
 };
 
 static int __init wireless_nlevent_init(void)
-- 
cgit v1.2.3


From 232cf06c611f57ccb8e321de3fd850f21215ede8 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:29:42 +0300
Subject: net: Convert sysctl_core_ops

These pernet_operations register and destroy sysctl
directory, and it's not interesting for foreign
pernet_operations.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sysctl_net_core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index f2d0462611c3..d714f65782b7 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -572,6 +572,7 @@ static __net_exit void sysctl_core_net_exit(struct net *net)
 static __net_initdata struct pernet_operations sysctl_core_ops = {
 	.init = sysctl_core_net_init,
 	.exit = sysctl_core_net_exit,
+	.async = true,
 };
 
 static __init int sysctl_core_init(void)
-- 
cgit v1.2.3


From f84c6821aa540342360067604ad156e3d53a67ed Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:29:52 +0300
Subject: net: Convert pernet_subsys, registered from inet_init()

arp_net_ops just addr/removes /proc entry.

devinet_ops allocates and frees duplicate of init_net tables
and (un)registers sysctl entries.

fib_net_ops allocates and frees pernet tables, creates/destroys
netlink socket and (un)initializes /proc entries. Foreign
pernet_operations do not touch them.

ip_rt_proc_ops only modifies pernet /proc entries.

xfrm_net_ops creates/destroys /proc entries, allocates/frees
pernet statistics, hashes and tables, and (un)initializes
sysctl files. These are not touched by foreigh pernet_operations

xfrm4_net_ops allocates/frees private pernet memory, and
configures sysctls.

sysctl_route_ops creates/destroys sysctls.

rt_genid_ops only initializes fields of just allocated net.

ipv4_inetpeer_ops allocated/frees net private memory.

igmp_net_ops just creates/destroys /proc files and socket,
noone else interested in.

tcp_sk_ops seems to be safe, because tcp_sk_init() does not
depend on any other pernet_operations modifications. Iteration
over hash table in inet_twsk_purge() is made under RCU lock,
and it's safe to iterate the table this way. Removing from
the table happen from inet_twsk_deschedule_put(), but this
function is safe without any extern locks, as it's synchronized
inside itself. There are many examples, it's used in different
context. So, it's safe to leave tcp_sk_exit_batch() unlocked.

tcp_net_metrics_ops is synchronized on tcp_metrics_lock and safe.

udplite4_net_ops only creates/destroys pernet /proc file.

icmp_sk_ops creates percpu sockets, not touched by foreign
pernet_operations.

ipmr_net_ops creates/destroys pernet fib tables, (un)registers
fib rules and /proc files. This seem to be safe to execute
in parallel with foreign pernet_operations.

af_inet_ops just sets up default parameters of newly created net.

ipv4_mib_ops creates and destroys pernet percpu statistics.

raw_net_ops, tcp4_net_ops, udp4_net_ops, ping_v4_net_ops
and ip_proc_ops only create/destroy pernet /proc files.

ip4_frags_ops creates and destroys sysctl file.

So, it's safe to make the pernet_operations async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c      | 2 ++
 net/ipv4/arp.c          | 1 +
 net/ipv4/devinet.c      | 1 +
 net/ipv4/fib_frontend.c | 1 +
 net/ipv4/icmp.c         | 1 +
 net/ipv4/igmp.c         | 1 +
 net/ipv4/ip_fragment.c  | 1 +
 net/ipv4/ipmr.c         | 1 +
 net/ipv4/ping.c         | 1 +
 net/ipv4/proc.c         | 1 +
 net/ipv4/raw.c          | 1 +
 net/ipv4/route.c        | 4 ++++
 net/ipv4/tcp_ipv4.c     | 2 ++
 net/ipv4/tcp_metrics.c  | 1 +
 net/ipv4/udp.c          | 1 +
 net/ipv4/udplite.c      | 1 +
 net/ipv4/xfrm4_policy.c | 1 +
 net/xfrm/xfrm_policy.c  | 1 +
 18 files changed, 23 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f98e2f0db841..e8c7fad8c329 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1735,6 +1735,7 @@ static __net_exit void ipv4_mib_exit_net(struct net *net)
 static __net_initdata struct pernet_operations ipv4_mib_ops = {
 	.init = ipv4_mib_init_net,
 	.exit = ipv4_mib_exit_net,
+	.async = true,
 };
 
 static int __init init_ipv4_mibs(void)
@@ -1788,6 +1789,7 @@ static __net_exit void inet_exit_net(struct net *net)
 static __net_initdata struct pernet_operations af_inet_ops = {
 	.init = inet_init_net,
 	.exit = inet_exit_net,
+	.async = true,
 };
 
 static int __init init_inet_pernet_ops(void)
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index f28f06c91ead..7dc9de8444a9 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1447,6 +1447,7 @@ static void __net_exit arp_net_exit(struct net *net)
 static struct pernet_operations arp_net_ops = {
 	.init = arp_net_init,
 	.exit = arp_net_exit,
+	.async = true,
 };
 
 static int __init arp_proc_init(void)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 40f001782c1b..5ae0d1f097ca 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2469,6 +2469,7 @@ static __net_exit void devinet_exit_net(struct net *net)
 static __net_initdata struct pernet_operations devinet_ops = {
 	.init = devinet_init_net,
 	.exit = devinet_exit_net,
+	.async = true,
 };
 
 static struct rtnl_af_ops inet_af_ops __read_mostly = {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index f05afaf3235c..ac71c3d496c0 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1362,6 +1362,7 @@ static void __net_exit fib_net_exit(struct net *net)
 static struct pernet_operations fib_net_ops = {
 	.init = fib_net_init,
 	.exit = fib_net_exit,
+	.async = true,
 };
 
 void __init ip_fib_init(void)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 1617604c9284..cc56efa64d5c 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1257,6 +1257,7 @@ fail:
 static struct pernet_operations __net_initdata icmp_sk_ops = {
        .init = icmp_sk_init,
        .exit = icmp_sk_exit,
+       .async = true,
 };
 
 int __init icmp_init(void)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index f2402581fef1..c2743763777e 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -3028,6 +3028,7 @@ static void __net_exit igmp_net_exit(struct net *net)
 static struct pernet_operations igmp_net_ops = {
 	.init = igmp_net_init,
 	.exit = igmp_net_exit,
+	.async = true,
 };
 #endif
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index bbf1b94942c0..5e843ae5e468 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -885,6 +885,7 @@ static void __net_exit ipv4_frags_exit_net(struct net *net)
 static struct pernet_operations ip4_frags_ops = {
 	.init = ipv4_frags_init_net,
 	.exit = ipv4_frags_exit_net,
+	.async = true,
 };
 
 void __init ipfrag_init(void)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index b05689bbba31..7c7ac9d32e77 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -3327,6 +3327,7 @@ static void __net_exit ipmr_net_exit(struct net *net)
 static struct pernet_operations ipmr_net_ops = {
 	.init = ipmr_net_init,
 	.exit = ipmr_net_exit,
+	.async = true,
 };
 
 int __init ip_mr_init(void)
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index b8f0db54b197..0164def9c808 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1204,6 +1204,7 @@ static void __net_exit ping_v4_proc_exit_net(struct net *net)
 static struct pernet_operations ping_v4_net_ops = {
 	.init = ping_v4_proc_init_net,
 	.exit = ping_v4_proc_exit_net,
+	.async = true,
 };
 
 int __init ping_proc_init(void)
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index dc5edc8f7564..fdabc70283b6 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -549,6 +549,7 @@ static __net_exit void ip_proc_exit_net(struct net *net)
 static __net_initdata struct pernet_operations ip_proc_ops = {
 	.init = ip_proc_init_net,
 	.exit = ip_proc_exit_net,
+	.async = true,
 };
 
 int __init ip_misc_proc_init(void)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 9b367fc48d7d..54648d20bf0f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -1156,6 +1156,7 @@ static __net_exit void raw_exit_net(struct net *net)
 static __net_initdata struct pernet_operations raw_net_ops = {
 	.init = raw_init_net,
 	.exit = raw_exit_net,
+	.async = true,
 };
 
 int __init raw_proc_init(void)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 49cc1c1df1ba..9376ed69ffeb 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -417,6 +417,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 	.init = ip_rt_do_proc_init,
 	.exit = ip_rt_do_proc_exit,
+	.async = true,
 };
 
 static int __init ip_rt_proc_init(void)
@@ -2994,6 +2995,7 @@ static __net_exit void sysctl_route_net_exit(struct net *net)
 static __net_initdata struct pernet_operations sysctl_route_ops = {
 	.init = sysctl_route_net_init,
 	.exit = sysctl_route_net_exit,
+	.async = true,
 };
 #endif
 
@@ -3007,6 +3009,7 @@ static __net_init int rt_genid_init(struct net *net)
 
 static __net_initdata struct pernet_operations rt_genid_ops = {
 	.init = rt_genid_init,
+	.async = true,
 };
 
 static int __net_init ipv4_inetpeer_init(struct net *net)
@@ -3032,6 +3035,7 @@ static void __net_exit ipv4_inetpeer_exit(struct net *net)
 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
 	.init	=	ipv4_inetpeer_init,
 	.exit	=	ipv4_inetpeer_exit,
+	.async	=	true,
 };
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f8ad397e285e..ac16795486ea 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2387,6 +2387,7 @@ static void __net_exit tcp4_proc_exit_net(struct net *net)
 static struct pernet_operations tcp4_net_ops = {
 	.init = tcp4_proc_init_net,
 	.exit = tcp4_proc_exit_net,
+	.async = true,
 };
 
 int __init tcp4_proc_init(void)
@@ -2573,6 +2574,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
        .init	   = tcp_sk_init,
        .exit	   = tcp_sk_exit,
        .exit_batch = tcp_sk_exit_batch,
+       .async	   = true,
 };
 
 void __init tcp_v4_init(void)
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 03b51cdcc731..aa6fea9f3328 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1024,6 +1024,7 @@ static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_lis
 static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
 	.init		=	tcp_net_metrics_init,
 	.exit_batch	=	tcp_net_metrics_exit_batch,
+	.async		=	true,
 };
 
 void __init tcp_metrics_init(void)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index bfaefe560b5c..ac5fac0e59b1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2757,6 +2757,7 @@ static void __net_exit udp4_proc_exit_net(struct net *net)
 static struct pernet_operations udp4_net_ops = {
 	.init = udp4_proc_init_net,
 	.exit = udp4_proc_exit_net,
+	.async = true,
 };
 
 int __init udp4_proc_init(void)
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index f96614e9b9a5..72f2c3806408 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -104,6 +104,7 @@ static void __net_exit udplite4_proc_exit_net(struct net *net)
 static struct pernet_operations udplite4_net_ops = {
 	.init = udplite4_proc_init_net,
 	.exit = udplite4_proc_exit_net,
+	.async = true,
 };
 
 static __init int udplite4_proc_init(void)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 05017e2c849c..753f526cf9db 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -365,6 +365,7 @@ static void __net_exit xfrm4_net_exit(struct net *net)
 static struct pernet_operations __net_initdata xfrm4_net_ops = {
 	.init	= xfrm4_net_init,
 	.exit	= xfrm4_net_exit,
+	.async	= true,
 };
 
 static void __init xfrm4_policy_init(void)
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7a23078132cf..77d9d1ab05ce 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2982,6 +2982,7 @@ static void __net_exit xfrm_net_exit(struct net *net)
 static struct pernet_operations __net_initdata xfrm_net_ops = {
 	.init = xfrm_net_init,
 	.exit = xfrm_net_exit,
+	.async = true,
 };
 
 void __init xfrm_init(void)
-- 
cgit v1.2.3


From 167f7ac723e5b4ea22c44a0bd8e357bb76a68cd2 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:30:00 +0300
Subject: net: Convert unix_net_ops

These pernet_operations are just create and destroy
/proc and sysctl entries, and are not touched by
foreign pernet_operations.

So, we are able to make them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 723698416242..e3eb8806b3e4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2913,6 +2913,7 @@ static void __net_exit unix_net_exit(struct net *net)
 static struct pernet_operations unix_net_ops = {
 	.init = unix_net_init,
 	.exit = unix_net_exit,
+	.async = true,
 };
 
 static int __init af_unix_init(void)
-- 
cgit v1.2.3


From cb5e3400e78598e1eb872954516a02ba85926d84 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:30:08 +0300
Subject: net: Convert packet_net_ops

These pernet_operations just create and destroy /proc entry,
and another operations do not touch it.

Also, nobody else are interested in foreign net::packet::sklist.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/packet/af_packet.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 616cb9c18f88..2c5a6fe5d749 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4557,6 +4557,7 @@ static void __net_exit packet_net_exit(struct net *net)
 static struct pernet_operations packet_net_ops = {
 	.init = packet_net_init,
 	.exit = packet_net_exit,
+	.async = true,
 };
 
 
-- 
cgit v1.2.3


From 22769a2a6e93a17d08e1cd7a2de2749088887b87 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:30:18 +0300
Subject: net: Convert ipv4_sysctl_ops

These pernet_operations create and destroy sysctl,
which are not touched by anybody else.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/sysctl_net_ipv4.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 93e172118a94..89683d868b37 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1219,6 +1219,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net)
 static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
 	.init = ipv4_sysctl_init_net,
 	.exit = ipv4_sysctl_exit_net,
+	.async = true,
 };
 
 static __init int sysctl_ipv4_init(void)
-- 
cgit v1.2.3


From 0bc9be67185ec90feedc971af6e7b84ab932703a Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:30:27 +0300
Subject: net: Convert addrconf_ops

These pernet_operations (un)register sysctl, which
are not touched by anybody else.

So, it's safe to make them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index e1846b97ee69..8c17f8d8d5d9 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6550,6 +6550,7 @@ static void __net_exit addrconf_exit_net(struct net *net)
 static struct pernet_operations addrconf_ops = {
 	.init = addrconf_init_net,
 	.exit = addrconf_exit_net,
+	.async = true,
 };
 
 static struct rtnl_af_ops inet6_ops __read_mostly = {
-- 
cgit v1.2.3


From 2608e6b7adc8b07194b855e2102d6f1a277e3f03 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:30:42 +0300
Subject: net: Convert default_device_ops

These pernet operations consist of exit() and exit_batch() methods.

default_device_exit() moves not-local and virtual devices to init_net.
There is nothing exciting, because this may happen in any time
on a working system, and rtnl_lock() and synchronize_net() protect
us from all cases of external dereference.

The same for default_device_exit_batch(). Similar unregisteration
may happen in any time on a system. Here several lists (like todo_list),
which are accessed under rtnl_lock(). After rtnl_unlock() and
netdev_run_todo() all the devices are flushed.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index dc7506f00a66..df5241c8eda1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8934,6 +8934,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
 static struct pernet_operations __net_initdata default_device_ops = {
 	.exit = default_device_exit,
 	.exit_batch = default_device_exit_batch,
+	.async = true,
 };
 
 /*
-- 
cgit v1.2.3


From 59a513587ac09359875d6074778f21a3c01754e0 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:30:52 +0300
Subject: net: Convert diag_net_ops

These pernet operations just create and destroy netlink
socket. The socket is pernet and else operations don't
touch it.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock_diag.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 146b50e30659..aee5642affd9 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -328,6 +328,7 @@ static void __net_exit diag_net_exit(struct net *net)
 static struct pernet_operations diag_net_ops = {
 	.init = diag_net_init,
 	.exit = diag_net_exit,
+	.async = true,
 };
 
 static int __init sock_diag_init(void)
-- 
cgit v1.2.3


From b86b47a39598cdf17e0e826ebe1be21c798112cf Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Feb 2018 12:31:01 +0300
Subject: net: Convert netlink_tap_net_ops

These pernet_operations init just allocated net memory,
and they obviously can be executed in parallel in any
others.

v3: New

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b3065908e146..63cb55d3c2fd 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -253,6 +253,7 @@ static struct pernet_operations netlink_tap_net_ops = {
 	.exit = netlink_tap_exit_net,
 	.id   = &netlink_tap_net_id,
 	.size = sizeof(struct netlink_tap_net),
+	.async = true,
 };
 
 static bool netlink_filter_tap(const struct sk_buff *skb)
-- 
cgit v1.2.3


From ff22b5bf78fe2a9f2a26950cc2ea180180bef47f Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 13 Feb 2018 19:33:20 +0800
Subject: sctp: rename sctp_diag.c as diag.c

Remove 'sctp_' prefix for diag file, to keep consistent with other
files' names.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/Makefile    |   2 +
 net/sctp/diag.c      | 526 +++++++++++++++++++++++++++++++++++++++++++++++++++
 net/sctp/sctp_diag.c | 526 ---------------------------------------------------
 3 files changed, 528 insertions(+), 526 deletions(-)
 create mode 100644 net/sctp/diag.c
 delete mode 100644 net/sctp/sctp_diag.c

(limited to 'net')

diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 6776582ec449..e845e4588535 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -15,6 +15,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  offload.o stream_sched.o stream_sched_prio.o \
 	  stream_sched_rr.o stream_interleave.o
 
+sctp_diag-y := diag.o
+
 sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o
 sctp-$(CONFIG_PROC_FS) += proc.o
 sctp-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sctp/diag.c b/net/sctp/diag.c
new file mode 100644
index 000000000000..a72a7d925d46
--- /dev/null
+++ b/net/sctp/diag.c
@@ -0,0 +1,526 @@
+#include <linux/module.h>
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+#include <net/sctp/sctp.h>
+
+static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+			       void *info);
+
+/* define some functions to make asoc/ep fill look clean */
+static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r,
+					struct sock *sk,
+					struct sctp_association *asoc)
+{
+	union sctp_addr laddr, paddr;
+	struct dst_entry *dst;
+	struct timer_list *t3_rtx = &asoc->peer.primary_path->T3_rtx_timer;
+
+	laddr = list_entry(asoc->base.bind_addr.address_list.next,
+			   struct sctp_sockaddr_entry, list)->a;
+	paddr = asoc->peer.primary_path->ipaddr;
+	dst = asoc->peer.primary_path->dst;
+
+	r->idiag_family = sk->sk_family;
+	r->id.idiag_sport = htons(asoc->base.bind_addr.port);
+	r->id.idiag_dport = htons(asoc->peer.port);
+	r->id.idiag_if = dst ? dst->dev->ifindex : 0;
+	sock_diag_save_cookie(sk, r->id.idiag_cookie);
+
+#if IS_ENABLED(CONFIG_IPV6)
+	if (sk->sk_family == AF_INET6) {
+		*(struct in6_addr *)r->id.idiag_src = laddr.v6.sin6_addr;
+		*(struct in6_addr *)r->id.idiag_dst = paddr.v6.sin6_addr;
+	} else
+#endif
+	{
+		memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+		memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+
+		r->id.idiag_src[0] = laddr.v4.sin_addr.s_addr;
+		r->id.idiag_dst[0] = paddr.v4.sin_addr.s_addr;
+	}
+
+	r->idiag_state = asoc->state;
+	if (timer_pending(t3_rtx)) {
+		r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX;
+		r->idiag_retrans = asoc->rtx_data_chunks;
+		r->idiag_expires = jiffies_to_msecs(t3_rtx->expires - jiffies);
+	} else {
+		r->idiag_timer = 0;
+		r->idiag_retrans = 0;
+		r->idiag_expires = 0;
+	}
+}
+
+static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb,
+					 struct list_head *address_list)
+{
+	struct sctp_sockaddr_entry *laddr;
+	int addrlen = sizeof(struct sockaddr_storage);
+	int addrcnt = 0;
+	struct nlattr *attr;
+	void *info = NULL;
+
+	list_for_each_entry_rcu(laddr, address_list, list)
+		addrcnt++;
+
+	attr = nla_reserve(skb, INET_DIAG_LOCALS, addrlen * addrcnt);
+	if (!attr)
+		return -EMSGSIZE;
+
+	info = nla_data(attr);
+	list_for_each_entry_rcu(laddr, address_list, list) {
+		memcpy(info, &laddr->a, sizeof(laddr->a));
+		memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a));
+		info += addrlen;
+	}
+
+	return 0;
+}
+
+static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb,
+					struct sctp_association *asoc)
+{
+	int addrlen = sizeof(struct sockaddr_storage);
+	struct sctp_transport *from;
+	struct nlattr *attr;
+	void *info = NULL;
+
+	attr = nla_reserve(skb, INET_DIAG_PEERS,
+			   addrlen * asoc->peer.transport_count);
+	if (!attr)
+		return -EMSGSIZE;
+
+	info = nla_data(attr);
+	list_for_each_entry(from, &asoc->peer.transport_addr_list,
+			    transports) {
+		memcpy(info, &from->ipaddr, sizeof(from->ipaddr));
+		memset(info + sizeof(from->ipaddr), 0,
+		       addrlen - sizeof(from->ipaddr));
+		info += addrlen;
+	}
+
+	return 0;
+}
+
+/* sctp asoc/ep fill*/
+static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
+			       struct sk_buff *skb,
+			       const struct inet_diag_req_v2 *req,
+			       struct user_namespace *user_ns,
+			       int portid, u32 seq, u16 nlmsg_flags,
+			       const struct nlmsghdr *unlh,
+			       bool net_admin)
+{
+	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+	struct list_head *addr_list;
+	struct inet_diag_msg *r;
+	struct nlmsghdr  *nlh;
+	int ext = req->idiag_ext;
+	struct sctp_infox infox;
+	void *info = NULL;
+
+	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
+			nlmsg_flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	r = nlmsg_data(nlh);
+	BUG_ON(!sk_fullsock(sk));
+
+	if (asoc) {
+		inet_diag_msg_sctpasoc_fill(r, sk, asoc);
+	} else {
+		inet_diag_msg_common_fill(r, sk);
+		r->idiag_state = sk->sk_state;
+		r->idiag_timer = 0;
+		r->idiag_retrans = 0;
+	}
+
+	if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
+		goto errout;
+
+	if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) {
+		u32 mem[SK_MEMINFO_VARS];
+		int amt;
+
+		if (asoc && asoc->ep->sndbuf_policy)
+			amt = asoc->sndbuf_used;
+		else
+			amt = sk_wmem_alloc_get(sk);
+		mem[SK_MEMINFO_WMEM_ALLOC] = amt;
+		if (asoc && asoc->ep->rcvbuf_policy)
+			amt = atomic_read(&asoc->rmem_alloc);
+		else
+			amt = sk_rmem_alloc_get(sk);
+		mem[SK_MEMINFO_RMEM_ALLOC] = amt;
+		mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+		mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+		mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
+		mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+		mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
+		mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
+		mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
+
+		if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0)
+			goto errout;
+	}
+
+	if (ext & (1 << (INET_DIAG_INFO - 1))) {
+		struct nlattr *attr;
+
+		attr = nla_reserve_64bit(skb, INET_DIAG_INFO,
+					 sizeof(struct sctp_info),
+					 INET_DIAG_PAD);
+		if (!attr)
+			goto errout;
+
+		info = nla_data(attr);
+	}
+	infox.sctpinfo = (struct sctp_info *)info;
+	infox.asoc = asoc;
+	sctp_diag_get_info(sk, r, &infox);
+
+	addr_list = asoc ? &asoc->base.bind_addr.address_list
+			 : &ep->base.bind_addr.address_list;
+	if (inet_diag_msg_sctpladdrs_fill(skb, addr_list))
+		goto errout;
+
+	if (asoc && (ext & (1 << (INET_DIAG_CONG - 1))))
+		if (nla_put_string(skb, INET_DIAG_CONG, "reno") < 0)
+			goto errout;
+
+	if (asoc && inet_diag_msg_sctpaddrs_fill(skb, asoc))
+		goto errout;
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+errout:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/* callback and param */
+struct sctp_comm_param {
+	struct sk_buff *skb;
+	struct netlink_callback *cb;
+	const struct inet_diag_req_v2 *r;
+	const struct nlmsghdr *nlh;
+	bool net_admin;
+};
+
+static size_t inet_assoc_attr_size(struct sctp_association *asoc)
+{
+	int addrlen = sizeof(struct sockaddr_storage);
+	int addrcnt = 0;
+	struct sctp_sockaddr_entry *laddr;
+
+	list_for_each_entry_rcu(laddr, &asoc->base.bind_addr.address_list,
+				list)
+		addrcnt++;
+
+	return	  nla_total_size(sizeof(struct sctp_info))
+		+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
+		+ nla_total_size(1) /* INET_DIAG_TOS */
+		+ nla_total_size(1) /* INET_DIAG_TCLASS */
+		+ nla_total_size(4) /* INET_DIAG_MARK */
+		+ nla_total_size(addrlen * asoc->peer.transport_count)
+		+ nla_total_size(addrlen * addrcnt)
+		+ nla_total_size(sizeof(struct inet_diag_meminfo))
+		+ nla_total_size(sizeof(struct inet_diag_msg))
+		+ 64;
+}
+
+static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p)
+{
+	struct sctp_association *assoc = tsp->asoc;
+	struct sock *sk = tsp->asoc->base.sk;
+	struct sctp_comm_param *commp = p;
+	struct sk_buff *in_skb = commp->skb;
+	const struct inet_diag_req_v2 *req = commp->r;
+	const struct nlmsghdr *nlh = commp->nlh;
+	struct net *net = sock_net(in_skb->sk);
+	struct sk_buff *rep;
+	int err;
+
+	err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
+	if (err)
+		goto out;
+
+	err = -ENOMEM;
+	rep = nlmsg_new(inet_assoc_attr_size(assoc), GFP_KERNEL);
+	if (!rep)
+		goto out;
+
+	lock_sock(sk);
+	if (sk != assoc->base.sk) {
+		release_sock(sk);
+		sk = assoc->base.sk;
+		lock_sock(sk);
+	}
+	err = inet_sctp_diag_fill(sk, assoc, rep, req,
+				  sk_user_ns(NETLINK_CB(in_skb).sk),
+				  NETLINK_CB(in_skb).portid,
+				  nlh->nlmsg_seq, 0, nlh,
+				  commp->net_admin);
+	release_sock(sk);
+	if (err < 0) {
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(rep);
+		goto out;
+	}
+
+	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
+			      MSG_DONTWAIT);
+	if (err > 0)
+		err = 0;
+out:
+	return err;
+}
+
+static int sctp_sock_dump(struct sctp_transport *tsp, void *p)
+{
+	struct sctp_endpoint *ep = tsp->asoc->ep;
+	struct sctp_comm_param *commp = p;
+	struct sock *sk = ep->base.sk;
+	struct sk_buff *skb = commp->skb;
+	struct netlink_callback *cb = commp->cb;
+	const struct inet_diag_req_v2 *r = commp->r;
+	struct sctp_association *assoc;
+	int err = 0;
+
+	lock_sock(sk);
+	list_for_each_entry(assoc, &ep->asocs, asocs) {
+		if (cb->args[4] < cb->args[1])
+			goto next;
+
+		if (r->id.idiag_sport != htons(assoc->base.bind_addr.port) &&
+		    r->id.idiag_sport)
+			goto next;
+		if (r->id.idiag_dport != htons(assoc->peer.port) &&
+		    r->id.idiag_dport)
+			goto next;
+
+		if (!cb->args[3] &&
+		    inet_sctp_diag_fill(sk, NULL, skb, r,
+					sk_user_ns(NETLINK_CB(cb->skb).sk),
+					NETLINK_CB(cb->skb).portid,
+					cb->nlh->nlmsg_seq,
+					NLM_F_MULTI, cb->nlh,
+					commp->net_admin) < 0) {
+			err = 1;
+			goto release;
+		}
+		cb->args[3] = 1;
+
+		if (inet_sctp_diag_fill(sk, assoc, skb, r,
+					sk_user_ns(NETLINK_CB(cb->skb).sk),
+					NETLINK_CB(cb->skb).portid,
+					cb->nlh->nlmsg_seq, 0, cb->nlh,
+					commp->net_admin) < 0) {
+			err = 1;
+			goto release;
+		}
+next:
+		cb->args[4]++;
+	}
+	cb->args[1] = 0;
+	cb->args[3] = 0;
+	cb->args[4] = 0;
+release:
+	release_sock(sk);
+	return err;
+}
+
+static int sctp_sock_filter(struct sctp_transport *tsp, void *p)
+{
+	struct sctp_endpoint *ep = tsp->asoc->ep;
+	struct sctp_comm_param *commp = p;
+	struct sock *sk = ep->base.sk;
+	const struct inet_diag_req_v2 *r = commp->r;
+	struct sctp_association *assoc =
+		list_entry(ep->asocs.next, struct sctp_association, asocs);
+
+	/* find the ep only once through the transports by this condition */
+	if (tsp->asoc != assoc)
+		return 0;
+
+	if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family)
+		return 0;
+
+	return 1;
+}
+
+static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
+{
+	struct sctp_comm_param *commp = p;
+	struct sock *sk = ep->base.sk;
+	struct sk_buff *skb = commp->skb;
+	struct netlink_callback *cb = commp->cb;
+	const struct inet_diag_req_v2 *r = commp->r;
+	struct net *net = sock_net(skb->sk);
+	struct inet_sock *inet = inet_sk(sk);
+	int err = 0;
+
+	if (!net_eq(sock_net(sk), net))
+		goto out;
+
+	if (cb->args[4] < cb->args[1])
+		goto next;
+
+	if (!(r->idiag_states & TCPF_LISTEN) && !list_empty(&ep->asocs))
+		goto next;
+
+	if (r->sdiag_family != AF_UNSPEC &&
+	    sk->sk_family != r->sdiag_family)
+		goto next;
+
+	if (r->id.idiag_sport != inet->inet_sport &&
+	    r->id.idiag_sport)
+		goto next;
+
+	if (r->id.idiag_dport != inet->inet_dport &&
+	    r->id.idiag_dport)
+		goto next;
+
+	if (inet_sctp_diag_fill(sk, NULL, skb, r,
+				sk_user_ns(NETLINK_CB(cb->skb).sk),
+				NETLINK_CB(cb->skb).portid,
+				cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				cb->nlh, commp->net_admin) < 0) {
+		err = 2;
+		goto out;
+	}
+next:
+	cb->args[4]++;
+out:
+	return err;
+}
+
+/* define the functions for sctp_diag_handler*/
+static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+			       void *info)
+{
+	struct sctp_infox *infox = (struct sctp_infox *)info;
+
+	if (infox->asoc) {
+		r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc);
+		r->idiag_wqueue = infox->asoc->sndbuf_used;
+	} else {
+		r->idiag_rqueue = sk->sk_ack_backlog;
+		r->idiag_wqueue = sk->sk_max_ack_backlog;
+	}
+	if (infox->sctpinfo)
+		sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo);
+}
+
+static int sctp_diag_dump_one(struct sk_buff *in_skb,
+			      const struct nlmsghdr *nlh,
+			      const struct inet_diag_req_v2 *req)
+{
+	struct net *net = sock_net(in_skb->sk);
+	union sctp_addr laddr, paddr;
+	struct sctp_comm_param commp = {
+		.skb = in_skb,
+		.r = req,
+		.nlh = nlh,
+		.net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN),
+	};
+
+	if (req->sdiag_family == AF_INET) {
+		laddr.v4.sin_port = req->id.idiag_sport;
+		laddr.v4.sin_addr.s_addr = req->id.idiag_src[0];
+		laddr.v4.sin_family = AF_INET;
+
+		paddr.v4.sin_port = req->id.idiag_dport;
+		paddr.v4.sin_addr.s_addr = req->id.idiag_dst[0];
+		paddr.v4.sin_family = AF_INET;
+	} else {
+		laddr.v6.sin6_port = req->id.idiag_sport;
+		memcpy(&laddr.v6.sin6_addr, req->id.idiag_src,
+		       sizeof(laddr.v6.sin6_addr));
+		laddr.v6.sin6_family = AF_INET6;
+
+		paddr.v6.sin6_port = req->id.idiag_dport;
+		memcpy(&paddr.v6.sin6_addr, req->id.idiag_dst,
+		       sizeof(paddr.v6.sin6_addr));
+		paddr.v6.sin6_family = AF_INET6;
+	}
+
+	return sctp_transport_lookup_process(sctp_tsp_dump_one,
+					     net, &laddr, &paddr, &commp);
+}
+
+static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			   const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+	u32 idiag_states = r->idiag_states;
+	struct net *net = sock_net(skb->sk);
+	struct sctp_comm_param commp = {
+		.skb = skb,
+		.cb = cb,
+		.r = r,
+		.net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN),
+	};
+	int pos = cb->args[2];
+
+	/* eps hashtable dumps
+	 * args:
+	 * 0 : if it will traversal listen sock
+	 * 1 : to record the sock pos of this time's traversal
+	 * 4 : to work as a temporary variable to traversal list
+	 */
+	if (cb->args[0] == 0) {
+		if (!(idiag_states & TCPF_LISTEN))
+			goto skip;
+		if (sctp_for_each_endpoint(sctp_ep_dump, &commp))
+			goto done;
+skip:
+		cb->args[0] = 1;
+		cb->args[1] = 0;
+		cb->args[4] = 0;
+	}
+
+	/* asocs by transport hashtable dump
+	 * args:
+	 * 1 : to record the assoc pos of this time's traversal
+	 * 2 : to record the transport pos of this time's traversal
+	 * 3 : to mark if we have dumped the ep info of the current asoc
+	 * 4 : to work as a temporary variable to traversal list
+	 * 5 : to save the sk we get from travelsing the tsp list.
+	 */
+	if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE)))
+		goto done;
+
+	sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump,
+				net, &pos, &commp);
+	cb->args[2] = pos;
+
+done:
+	cb->args[1] = cb->args[4];
+	cb->args[4] = 0;
+}
+
+static const struct inet_diag_handler sctp_diag_handler = {
+	.dump		 = sctp_diag_dump,
+	.dump_one	 = sctp_diag_dump_one,
+	.idiag_get_info  = sctp_diag_get_info,
+	.idiag_type	 = IPPROTO_SCTP,
+	.idiag_info_size = sizeof(struct sctp_info),
+};
+
+static int __init sctp_diag_init(void)
+{
+	return inet_diag_register(&sctp_diag_handler);
+}
+
+static void __exit sctp_diag_exit(void)
+{
+	inet_diag_unregister(&sctp_diag_handler);
+}
+
+module_init(sctp_diag_init);
+module_exit(sctp_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-132);
diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c
deleted file mode 100644
index a72a7d925d46..000000000000
--- a/net/sctp/sctp_diag.c
+++ /dev/null
@@ -1,526 +0,0 @@
-#include <linux/module.h>
-#include <linux/inet_diag.h>
-#include <linux/sock_diag.h>
-#include <net/sctp/sctp.h>
-
-static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
-			       void *info);
-
-/* define some functions to make asoc/ep fill look clean */
-static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r,
-					struct sock *sk,
-					struct sctp_association *asoc)
-{
-	union sctp_addr laddr, paddr;
-	struct dst_entry *dst;
-	struct timer_list *t3_rtx = &asoc->peer.primary_path->T3_rtx_timer;
-
-	laddr = list_entry(asoc->base.bind_addr.address_list.next,
-			   struct sctp_sockaddr_entry, list)->a;
-	paddr = asoc->peer.primary_path->ipaddr;
-	dst = asoc->peer.primary_path->dst;
-
-	r->idiag_family = sk->sk_family;
-	r->id.idiag_sport = htons(asoc->base.bind_addr.port);
-	r->id.idiag_dport = htons(asoc->peer.port);
-	r->id.idiag_if = dst ? dst->dev->ifindex : 0;
-	sock_diag_save_cookie(sk, r->id.idiag_cookie);
-
-#if IS_ENABLED(CONFIG_IPV6)
-	if (sk->sk_family == AF_INET6) {
-		*(struct in6_addr *)r->id.idiag_src = laddr.v6.sin6_addr;
-		*(struct in6_addr *)r->id.idiag_dst = paddr.v6.sin6_addr;
-	} else
-#endif
-	{
-		memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
-		memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
-
-		r->id.idiag_src[0] = laddr.v4.sin_addr.s_addr;
-		r->id.idiag_dst[0] = paddr.v4.sin_addr.s_addr;
-	}
-
-	r->idiag_state = asoc->state;
-	if (timer_pending(t3_rtx)) {
-		r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX;
-		r->idiag_retrans = asoc->rtx_data_chunks;
-		r->idiag_expires = jiffies_to_msecs(t3_rtx->expires - jiffies);
-	} else {
-		r->idiag_timer = 0;
-		r->idiag_retrans = 0;
-		r->idiag_expires = 0;
-	}
-}
-
-static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb,
-					 struct list_head *address_list)
-{
-	struct sctp_sockaddr_entry *laddr;
-	int addrlen = sizeof(struct sockaddr_storage);
-	int addrcnt = 0;
-	struct nlattr *attr;
-	void *info = NULL;
-
-	list_for_each_entry_rcu(laddr, address_list, list)
-		addrcnt++;
-
-	attr = nla_reserve(skb, INET_DIAG_LOCALS, addrlen * addrcnt);
-	if (!attr)
-		return -EMSGSIZE;
-
-	info = nla_data(attr);
-	list_for_each_entry_rcu(laddr, address_list, list) {
-		memcpy(info, &laddr->a, sizeof(laddr->a));
-		memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a));
-		info += addrlen;
-	}
-
-	return 0;
-}
-
-static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb,
-					struct sctp_association *asoc)
-{
-	int addrlen = sizeof(struct sockaddr_storage);
-	struct sctp_transport *from;
-	struct nlattr *attr;
-	void *info = NULL;
-
-	attr = nla_reserve(skb, INET_DIAG_PEERS,
-			   addrlen * asoc->peer.transport_count);
-	if (!attr)
-		return -EMSGSIZE;
-
-	info = nla_data(attr);
-	list_for_each_entry(from, &asoc->peer.transport_addr_list,
-			    transports) {
-		memcpy(info, &from->ipaddr, sizeof(from->ipaddr));
-		memset(info + sizeof(from->ipaddr), 0,
-		       addrlen - sizeof(from->ipaddr));
-		info += addrlen;
-	}
-
-	return 0;
-}
-
-/* sctp asoc/ep fill*/
-static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
-			       struct sk_buff *skb,
-			       const struct inet_diag_req_v2 *req,
-			       struct user_namespace *user_ns,
-			       int portid, u32 seq, u16 nlmsg_flags,
-			       const struct nlmsghdr *unlh,
-			       bool net_admin)
-{
-	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
-	struct list_head *addr_list;
-	struct inet_diag_msg *r;
-	struct nlmsghdr  *nlh;
-	int ext = req->idiag_ext;
-	struct sctp_infox infox;
-	void *info = NULL;
-
-	nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
-			nlmsg_flags);
-	if (!nlh)
-		return -EMSGSIZE;
-
-	r = nlmsg_data(nlh);
-	BUG_ON(!sk_fullsock(sk));
-
-	if (asoc) {
-		inet_diag_msg_sctpasoc_fill(r, sk, asoc);
-	} else {
-		inet_diag_msg_common_fill(r, sk);
-		r->idiag_state = sk->sk_state;
-		r->idiag_timer = 0;
-		r->idiag_retrans = 0;
-	}
-
-	if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
-		goto errout;
-
-	if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) {
-		u32 mem[SK_MEMINFO_VARS];
-		int amt;
-
-		if (asoc && asoc->ep->sndbuf_policy)
-			amt = asoc->sndbuf_used;
-		else
-			amt = sk_wmem_alloc_get(sk);
-		mem[SK_MEMINFO_WMEM_ALLOC] = amt;
-		if (asoc && asoc->ep->rcvbuf_policy)
-			amt = atomic_read(&asoc->rmem_alloc);
-		else
-			amt = sk_rmem_alloc_get(sk);
-		mem[SK_MEMINFO_RMEM_ALLOC] = amt;
-		mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
-		mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
-		mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
-		mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
-		mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
-		mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
-		mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
-
-		if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0)
-			goto errout;
-	}
-
-	if (ext & (1 << (INET_DIAG_INFO - 1))) {
-		struct nlattr *attr;
-
-		attr = nla_reserve_64bit(skb, INET_DIAG_INFO,
-					 sizeof(struct sctp_info),
-					 INET_DIAG_PAD);
-		if (!attr)
-			goto errout;
-
-		info = nla_data(attr);
-	}
-	infox.sctpinfo = (struct sctp_info *)info;
-	infox.asoc = asoc;
-	sctp_diag_get_info(sk, r, &infox);
-
-	addr_list = asoc ? &asoc->base.bind_addr.address_list
-			 : &ep->base.bind_addr.address_list;
-	if (inet_diag_msg_sctpladdrs_fill(skb, addr_list))
-		goto errout;
-
-	if (asoc && (ext & (1 << (INET_DIAG_CONG - 1))))
-		if (nla_put_string(skb, INET_DIAG_CONG, "reno") < 0)
-			goto errout;
-
-	if (asoc && inet_diag_msg_sctpaddrs_fill(skb, asoc))
-		goto errout;
-
-	nlmsg_end(skb, nlh);
-	return 0;
-
-errout:
-	nlmsg_cancel(skb, nlh);
-	return -EMSGSIZE;
-}
-
-/* callback and param */
-struct sctp_comm_param {
-	struct sk_buff *skb;
-	struct netlink_callback *cb;
-	const struct inet_diag_req_v2 *r;
-	const struct nlmsghdr *nlh;
-	bool net_admin;
-};
-
-static size_t inet_assoc_attr_size(struct sctp_association *asoc)
-{
-	int addrlen = sizeof(struct sockaddr_storage);
-	int addrcnt = 0;
-	struct sctp_sockaddr_entry *laddr;
-
-	list_for_each_entry_rcu(laddr, &asoc->base.bind_addr.address_list,
-				list)
-		addrcnt++;
-
-	return	  nla_total_size(sizeof(struct sctp_info))
-		+ nla_total_size(1) /* INET_DIAG_SHUTDOWN */
-		+ nla_total_size(1) /* INET_DIAG_TOS */
-		+ nla_total_size(1) /* INET_DIAG_TCLASS */
-		+ nla_total_size(4) /* INET_DIAG_MARK */
-		+ nla_total_size(addrlen * asoc->peer.transport_count)
-		+ nla_total_size(addrlen * addrcnt)
-		+ nla_total_size(sizeof(struct inet_diag_meminfo))
-		+ nla_total_size(sizeof(struct inet_diag_msg))
-		+ 64;
-}
-
-static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p)
-{
-	struct sctp_association *assoc = tsp->asoc;
-	struct sock *sk = tsp->asoc->base.sk;
-	struct sctp_comm_param *commp = p;
-	struct sk_buff *in_skb = commp->skb;
-	const struct inet_diag_req_v2 *req = commp->r;
-	const struct nlmsghdr *nlh = commp->nlh;
-	struct net *net = sock_net(in_skb->sk);
-	struct sk_buff *rep;
-	int err;
-
-	err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
-	if (err)
-		goto out;
-
-	err = -ENOMEM;
-	rep = nlmsg_new(inet_assoc_attr_size(assoc), GFP_KERNEL);
-	if (!rep)
-		goto out;
-
-	lock_sock(sk);
-	if (sk != assoc->base.sk) {
-		release_sock(sk);
-		sk = assoc->base.sk;
-		lock_sock(sk);
-	}
-	err = inet_sctp_diag_fill(sk, assoc, rep, req,
-				  sk_user_ns(NETLINK_CB(in_skb).sk),
-				  NETLINK_CB(in_skb).portid,
-				  nlh->nlmsg_seq, 0, nlh,
-				  commp->net_admin);
-	release_sock(sk);
-	if (err < 0) {
-		WARN_ON(err == -EMSGSIZE);
-		kfree_skb(rep);
-		goto out;
-	}
-
-	err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
-			      MSG_DONTWAIT);
-	if (err > 0)
-		err = 0;
-out:
-	return err;
-}
-
-static int sctp_sock_dump(struct sctp_transport *tsp, void *p)
-{
-	struct sctp_endpoint *ep = tsp->asoc->ep;
-	struct sctp_comm_param *commp = p;
-	struct sock *sk = ep->base.sk;
-	struct sk_buff *skb = commp->skb;
-	struct netlink_callback *cb = commp->cb;
-	const struct inet_diag_req_v2 *r = commp->r;
-	struct sctp_association *assoc;
-	int err = 0;
-
-	lock_sock(sk);
-	list_for_each_entry(assoc, &ep->asocs, asocs) {
-		if (cb->args[4] < cb->args[1])
-			goto next;
-
-		if (r->id.idiag_sport != htons(assoc->base.bind_addr.port) &&
-		    r->id.idiag_sport)
-			goto next;
-		if (r->id.idiag_dport != htons(assoc->peer.port) &&
-		    r->id.idiag_dport)
-			goto next;
-
-		if (!cb->args[3] &&
-		    inet_sctp_diag_fill(sk, NULL, skb, r,
-					sk_user_ns(NETLINK_CB(cb->skb).sk),
-					NETLINK_CB(cb->skb).portid,
-					cb->nlh->nlmsg_seq,
-					NLM_F_MULTI, cb->nlh,
-					commp->net_admin) < 0) {
-			err = 1;
-			goto release;
-		}
-		cb->args[3] = 1;
-
-		if (inet_sctp_diag_fill(sk, assoc, skb, r,
-					sk_user_ns(NETLINK_CB(cb->skb).sk),
-					NETLINK_CB(cb->skb).portid,
-					cb->nlh->nlmsg_seq, 0, cb->nlh,
-					commp->net_admin) < 0) {
-			err = 1;
-			goto release;
-		}
-next:
-		cb->args[4]++;
-	}
-	cb->args[1] = 0;
-	cb->args[3] = 0;
-	cb->args[4] = 0;
-release:
-	release_sock(sk);
-	return err;
-}
-
-static int sctp_sock_filter(struct sctp_transport *tsp, void *p)
-{
-	struct sctp_endpoint *ep = tsp->asoc->ep;
-	struct sctp_comm_param *commp = p;
-	struct sock *sk = ep->base.sk;
-	const struct inet_diag_req_v2 *r = commp->r;
-	struct sctp_association *assoc =
-		list_entry(ep->asocs.next, struct sctp_association, asocs);
-
-	/* find the ep only once through the transports by this condition */
-	if (tsp->asoc != assoc)
-		return 0;
-
-	if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family)
-		return 0;
-
-	return 1;
-}
-
-static int sctp_ep_dump(struct sctp_endpoint *ep, void *p)
-{
-	struct sctp_comm_param *commp = p;
-	struct sock *sk = ep->base.sk;
-	struct sk_buff *skb = commp->skb;
-	struct netlink_callback *cb = commp->cb;
-	const struct inet_diag_req_v2 *r = commp->r;
-	struct net *net = sock_net(skb->sk);
-	struct inet_sock *inet = inet_sk(sk);
-	int err = 0;
-
-	if (!net_eq(sock_net(sk), net))
-		goto out;
-
-	if (cb->args[4] < cb->args[1])
-		goto next;
-
-	if (!(r->idiag_states & TCPF_LISTEN) && !list_empty(&ep->asocs))
-		goto next;
-
-	if (r->sdiag_family != AF_UNSPEC &&
-	    sk->sk_family != r->sdiag_family)
-		goto next;
-
-	if (r->id.idiag_sport != inet->inet_sport &&
-	    r->id.idiag_sport)
-		goto next;
-
-	if (r->id.idiag_dport != inet->inet_dport &&
-	    r->id.idiag_dport)
-		goto next;
-
-	if (inet_sctp_diag_fill(sk, NULL, skb, r,
-				sk_user_ns(NETLINK_CB(cb->skb).sk),
-				NETLINK_CB(cb->skb).portid,
-				cb->nlh->nlmsg_seq, NLM_F_MULTI,
-				cb->nlh, commp->net_admin) < 0) {
-		err = 2;
-		goto out;
-	}
-next:
-	cb->args[4]++;
-out:
-	return err;
-}
-
-/* define the functions for sctp_diag_handler*/
-static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
-			       void *info)
-{
-	struct sctp_infox *infox = (struct sctp_infox *)info;
-
-	if (infox->asoc) {
-		r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc);
-		r->idiag_wqueue = infox->asoc->sndbuf_used;
-	} else {
-		r->idiag_rqueue = sk->sk_ack_backlog;
-		r->idiag_wqueue = sk->sk_max_ack_backlog;
-	}
-	if (infox->sctpinfo)
-		sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo);
-}
-
-static int sctp_diag_dump_one(struct sk_buff *in_skb,
-			      const struct nlmsghdr *nlh,
-			      const struct inet_diag_req_v2 *req)
-{
-	struct net *net = sock_net(in_skb->sk);
-	union sctp_addr laddr, paddr;
-	struct sctp_comm_param commp = {
-		.skb = in_skb,
-		.r = req,
-		.nlh = nlh,
-		.net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN),
-	};
-
-	if (req->sdiag_family == AF_INET) {
-		laddr.v4.sin_port = req->id.idiag_sport;
-		laddr.v4.sin_addr.s_addr = req->id.idiag_src[0];
-		laddr.v4.sin_family = AF_INET;
-
-		paddr.v4.sin_port = req->id.idiag_dport;
-		paddr.v4.sin_addr.s_addr = req->id.idiag_dst[0];
-		paddr.v4.sin_family = AF_INET;
-	} else {
-		laddr.v6.sin6_port = req->id.idiag_sport;
-		memcpy(&laddr.v6.sin6_addr, req->id.idiag_src,
-		       sizeof(laddr.v6.sin6_addr));
-		laddr.v6.sin6_family = AF_INET6;
-
-		paddr.v6.sin6_port = req->id.idiag_dport;
-		memcpy(&paddr.v6.sin6_addr, req->id.idiag_dst,
-		       sizeof(paddr.v6.sin6_addr));
-		paddr.v6.sin6_family = AF_INET6;
-	}
-
-	return sctp_transport_lookup_process(sctp_tsp_dump_one,
-					     net, &laddr, &paddr, &commp);
-}
-
-static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
-			   const struct inet_diag_req_v2 *r, struct nlattr *bc)
-{
-	u32 idiag_states = r->idiag_states;
-	struct net *net = sock_net(skb->sk);
-	struct sctp_comm_param commp = {
-		.skb = skb,
-		.cb = cb,
-		.r = r,
-		.net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN),
-	};
-	int pos = cb->args[2];
-
-	/* eps hashtable dumps
-	 * args:
-	 * 0 : if it will traversal listen sock
-	 * 1 : to record the sock pos of this time's traversal
-	 * 4 : to work as a temporary variable to traversal list
-	 */
-	if (cb->args[0] == 0) {
-		if (!(idiag_states & TCPF_LISTEN))
-			goto skip;
-		if (sctp_for_each_endpoint(sctp_ep_dump, &commp))
-			goto done;
-skip:
-		cb->args[0] = 1;
-		cb->args[1] = 0;
-		cb->args[4] = 0;
-	}
-
-	/* asocs by transport hashtable dump
-	 * args:
-	 * 1 : to record the assoc pos of this time's traversal
-	 * 2 : to record the transport pos of this time's traversal
-	 * 3 : to mark if we have dumped the ep info of the current asoc
-	 * 4 : to work as a temporary variable to traversal list
-	 * 5 : to save the sk we get from travelsing the tsp list.
-	 */
-	if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE)))
-		goto done;
-
-	sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump,
-				net, &pos, &commp);
-	cb->args[2] = pos;
-
-done:
-	cb->args[1] = cb->args[4];
-	cb->args[4] = 0;
-}
-
-static const struct inet_diag_handler sctp_diag_handler = {
-	.dump		 = sctp_diag_dump,
-	.dump_one	 = sctp_diag_dump_one,
-	.idiag_get_info  = sctp_diag_get_info,
-	.idiag_type	 = IPPROTO_SCTP,
-	.idiag_info_size = sizeof(struct sctp_info),
-};
-
-static int __init sctp_diag_init(void)
-{
-	return inet_diag_register(&sctp_diag_handler);
-}
-
-static void __exit sctp_diag_exit(void)
-{
-	inet_diag_unregister(&sctp_diag_handler);
-}
-
-module_init(sctp_diag_init);
-module_exit(sctp_diag_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-132);
-- 
cgit v1.2.3


From 6f68dc993afce0b580b9b1a9edadcd4b178cbb06 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Tue, 13 Feb 2018 19:33:21 +0800
Subject: sctp: add file comments in diag.c

This patch is to add the missing file comments for sctp diag file.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/diag.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'net')

diff --git a/net/sctp/diag.c b/net/sctp/diag.c
index a72a7d925d46..078f01a8d582 100644
--- a/net/sctp/diag.c
+++ b/net/sctp/diag.c
@@ -1,3 +1,34 @@
+/* SCTP kernel implementation
+ * (C) Copyright Red Hat Inc. 2017
+ *
+ * This file is part of the SCTP kernel implementation
+ *
+ * These functions implement sctp diag support.
+ *
+ * This SCTP implementation is free software;
+ * you can redistribute it and/or modify it under the terms of
+ * the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This SCTP implementation is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *                 ************************
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU CC; see the file COPYING.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ * Please send any bug reports or fixes you make to the
+ * email addresched(es):
+ *    lksctp developers <linux-sctp@vger.kernel.org>
+ *
+ * Written or modified by:
+ *    Xin Long <lucien.xin@gmail.com>
+ */
+
 #include <linux/module.h>
 #include <linux/inet_diag.h>
 #include <linux/sock_diag.h>
-- 
cgit v1.2.3


From 0d876f2c6de5b1b00c2fd38bb4f4692e720207b1 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 13 Feb 2018 08:11:34 -0800
Subject: net/ipv4: Simplify fib_select_path

If flow oif is set and it is not an l3mdev, then fib_select_path
can jump to the source address check.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c586597da20d..0b8c0a719f3e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1760,13 +1760,11 @@ void fib_select_multipath(struct fib_result *res, int hash)
 void fib_select_path(struct net *net, struct fib_result *res,
 		     struct flowi4 *fl4, const struct sk_buff *skb)
 {
-	bool oif_check;
-
-	oif_check = (fl4->flowi4_oif == 0 ||
-		     fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF);
+	if (fl4->flowi4_oif && !(fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF))
+		goto check_saddr;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (res->fi->fib_nhs > 1 && oif_check) {
+	if (res->fi->fib_nhs > 1) {
 		int h = fib_multipath_hash(res->fi, fl4, skb);
 
 		fib_select_multipath(res, h);
@@ -1775,9 +1773,10 @@ void fib_select_path(struct net *net, struct fib_result *res,
 #endif
 	if (!res->prefixlen &&
 	    res->table->tb_num_default > 1 &&
-	    res->type == RTN_UNICAST && oif_check)
+	    res->type == RTN_UNICAST)
 		fib_select_default(fl4, res);
 
+check_saddr:
 	if (!fl4->saddr)
 		fl4->saddr = FIB_RES_PREFSRC(net, *res);
 }
-- 
cgit v1.2.3


From 8c2ceabe99e04005cadba739856eed6953a8a3af Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 13 Feb 2018 08:12:12 -0800
Subject: net/ipv4: Unexport fib_multipath_hash and fib_select_path

Do not export fib_multipath_hash or fib_select_path; both are only used
by core ipv4 code.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 1 -
 net/ipv4/route.c         | 1 -
 2 files changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 0b8c0a719f3e..2b883b58354f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1780,4 +1780,3 @@ check_saddr:
 	if (!fl4->saddr)
 		fl4->saddr = FIB_RES_PREFSRC(net, *res);
 }
-EXPORT_SYMBOL_GPL(fib_select_path);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 9376ed69ffeb..b0ef4cc3e875 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1847,7 +1847,6 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
 
 	return mhash >> 1;
 }
-EXPORT_SYMBOL_GPL(fib_multipath_hash);
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
 static int ip_mkroute_input(struct sk_buff *skb,
-- 
cgit v1.2.3


From ee6f4a5ad806a13b6b26b6699bb5562c08148280 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Sun, 11 Feb 2018 22:50:46 -0500
Subject: ieee802154: 6lowpan: set IFF_NO_QUEUE

This patch sets the IFF_NO_QUEUE for IEEE 802.15.4 6lowpan interfaces. As
commit 24dcbf662205 ("6lowpan: Don't set IFF_NO_QUEUE") removes it for
"reasons" from the bluetooth 6lowpan subsystem. In IEEE 802.15.4 the lower
interface deals with one qdisc for the real hardware, 6LoWPAN does the
protocol adaption only and no second queuing on top.

Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: Stefan Schmidt <stefan@osg.samsung.com>
---
 net/ieee802154/6lowpan/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 974765b7d92a..e4f305320519 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -104,6 +104,7 @@ static void lowpan_setup(struct net_device *ldev)
 	/* We need an ipv6hdr as minimum len when calling xmit */
 	ldev->hard_header_len	= sizeof(struct ipv6hdr);
 	ldev->flags		= IFF_BROADCAST | IFF_MULTICAST;
+	ldev->priv_flags	|= IFF_NO_QUEUE;
 
 	ldev->netdev_ops	= &lowpan_netdev_ops;
 	ldev->header_ops	= &lowpan_header_ops;
-- 
cgit v1.2.3


From 330c7272c40e965b8ab510d1022acd6e6a32e9c8 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 13 Feb 2018 08:52:00 -0800
Subject: net: Make dn_ptr depend on CONFIG_DECNET

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index df5241c8eda1..4bd4ad7ffda4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8134,8 +8134,9 @@ void netdev_run_todo(void)
 		BUG_ON(!list_empty(&dev->ptype_specific));
 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
+#if IS_ENABLED(CONFIG_DECNET)
 		WARN_ON(dev->dn_ptr);
-
+#endif
 		if (dev->priv_destructor)
 			dev->priv_destructor(dev);
 		if (dev->needs_free_netdev)
-- 
cgit v1.2.3


From e92bad5034922776bd5e64e3db739d7607eb7e29 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 13 Feb 2018 08:52:03 -0800
Subject: net: Remove atalk header from socket.c

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index fac8246a8ae8..d83e804d5e65 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -104,7 +104,6 @@
 #include <linux/ipv6_route.h>
 #include <linux/route.h>
 #include <linux/sockios.h>
-#include <linux/atalk.h>
 #include <net/busy_poll.h>
 #include <linux/errqueue.h>
 
-- 
cgit v1.2.3


From e0f9759f530bf789e984961dce79f525b151ecf3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 13 Feb 2018 06:14:12 -0800
Subject: tcp: try to keep packet if SYN_RCV race is lost
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

배석진 reported that in some situations, packets for a given 5-tuple
end up being processed by different CPUS.

This involves RPS, and fragmentation.

배석진 is seeing packet drops when a SYN_RECV request socket is
moved into ESTABLISH state. Other states are protected by socket lock.

This is caused by a CPU losing the race, and simply not caring enough.

Since this seems to occur frequently, we can do better and perform
a second lookup.

Note that all needed memory barriers are already in the existing code,
thanks to the spin_lock()/spin_unlock() pair in inet_ehash_insert()
and reqsk_put(). The second lookup must find the new socket,
unless it has already been accepted and closed by another cpu.

Note that the fragmentation could be avoided in the first place by
use of a correct TCP MSS option in the SYN{ACK} packet, but this
does not mean we can not be more robust.

Many thanks to 배석진 for a very detailed analysis.

Reported-by: 배석진 <soukjin.bae@samsung.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c     |  4 +++-
 net/ipv4/tcp_ipv4.c      | 13 ++++++++++++-
 net/ipv4/tcp_minisocks.c |  3 ++-
 net/ipv6/tcp_ipv6.c      | 13 ++++++++++++-
 4 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 575d3c1fb6e8..a6b48f6253e3 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5870,10 +5870,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 	tp->rx_opt.saw_tstamp = 0;
 	req = tp->fastopen_rsk;
 	if (req) {
+		bool req_stolen;
+
 		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
 		    sk->sk_state != TCP_FIN_WAIT1);
 
-		if (!tcp_check_req(sk, skb, req, true))
+		if (!tcp_check_req(sk, skb, req, true, &req_stolen))
 			goto discard;
 	}
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ac16795486ea..f3e52bc98980 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1672,6 +1672,7 @@ process:
 
 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
 		struct request_sock *req = inet_reqsk(sk);
+		bool req_stolen = false;
 		struct sock *nsk;
 
 		sk = req->rsk_listener;
@@ -1694,10 +1695,20 @@ process:
 			th = (const struct tcphdr *)skb->data;
 			iph = ip_hdr(skb);
 			tcp_v4_fill_cb(skb, iph, th);
-			nsk = tcp_check_req(sk, skb, req, false);
+			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
 		}
 		if (!nsk) {
 			reqsk_put(req);
+			if (req_stolen) {
+				/* Another cpu got exclusive access to req
+				 * and created a full blown socket.
+				 * Try to feed this packet to this socket
+				 * instead of discarding it.
+				 */
+				tcp_v4_restore_cb(skb);
+				sock_put(sk);
+				goto lookup;
+			}
 			goto discard_and_relse;
 		}
 		if (nsk == sk) {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a8384b0c11f8..e7e36433cdb5 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -578,7 +578,7 @@ EXPORT_SYMBOL(tcp_create_openreq_child);
 
 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			   struct request_sock *req,
-			   bool fastopen)
+			   bool fastopen, bool *req_stolen)
 {
 	struct tcp_options_received tmp_opt;
 	struct sock *child;
@@ -785,6 +785,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 
 	sock_rps_save_rxhash(child, skb);
 	tcp_synack_rtt_meas(child, req);
+	*req_stolen = !own_req;
 	return inet_csk_complete_hashdance(sk, child, req, own_req);
 
 listen_overflow:
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 412139f4eccd..883df0ad5bfe 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1451,6 +1451,7 @@ process:
 
 	if (sk->sk_state == TCP_NEW_SYN_RECV) {
 		struct request_sock *req = inet_reqsk(sk);
+		bool req_stolen = false;
 		struct sock *nsk;
 
 		sk = req->rsk_listener;
@@ -1470,10 +1471,20 @@ process:
 			th = (const struct tcphdr *)skb->data;
 			hdr = ipv6_hdr(skb);
 			tcp_v6_fill_cb(skb, hdr, th);
-			nsk = tcp_check_req(sk, skb, req, false);
+			nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
 		}
 		if (!nsk) {
 			reqsk_put(req);
+			if (req_stolen) {
+				/* Another cpu got exclusive access to req
+				 * and created a full blown socket.
+				 * Try to feed this packet to this socket
+				 * instead of discarding it.
+				 */
+				tcp_v6_restore_cb(skb);
+				sock_put(sk);
+				goto lookup;
+			}
 			goto discard_and_relse;
 		}
 		if (nsk == sk) {
-- 
cgit v1.2.3


From 0336369d3a4d65c9332476b618ff3bb9b41045e1 Mon Sep 17 00:00:00 2001
From: Brandon Streiff <brandon.streiff@ni.com>
Date: Wed, 14 Feb 2018 01:07:48 +0100
Subject: net: dsa: forward hardware timestamping ioctls to switch driver

This patch adds support to the dsa slave network device so that
switch drivers can implement the SIOC[GS]HWTSTAMP ioctls and the
ethtool timestamp-info interface.

Signed-off-by: Brandon Streiff <brandon.streiff@ni.com>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/slave.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'net')

diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index f52307296de4..935d93f0d36c 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -255,6 +255,22 @@ dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
 
 static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 {
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->dp->ds;
+	int port = p->dp->index;
+
+	/* Pass through to switch driver if it supports timestamping */
+	switch (cmd) {
+	case SIOCGHWTSTAMP:
+		if (ds->ops->port_hwtstamp_get)
+			return ds->ops->port_hwtstamp_get(ds, port, ifr);
+		break;
+	case SIOCSHWTSTAMP:
+		if (ds->ops->port_hwtstamp_set)
+			return ds->ops->port_hwtstamp_set(ds, port, ifr);
+		break;
+	}
+
 	if (!dev->phydev)
 		return -ENODEV;
 
@@ -918,6 +934,18 @@ static int dsa_slave_set_rxnfc(struct net_device *dev,
 	return ds->ops->set_rxnfc(ds, dp->index, nfc);
 }
 
+static int dsa_slave_get_ts_info(struct net_device *dev,
+				 struct ethtool_ts_info *ts)
+{
+	struct dsa_slave_priv *p = netdev_priv(dev);
+	struct dsa_switch *ds = p->dp->ds;
+
+	if (!ds->ops->get_ts_info)
+		return -EOPNOTSUPP;
+
+	return ds->ops->get_ts_info(ds, p->dp->index, ts);
+}
+
 static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.get_drvinfo		= dsa_slave_get_drvinfo,
 	.get_regs_len		= dsa_slave_get_regs_len,
@@ -938,6 +966,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
 	.set_link_ksettings	= phy_ethtool_set_link_ksettings,
 	.get_rxnfc		= dsa_slave_get_rxnfc,
 	.set_rxnfc		= dsa_slave_set_rxnfc,
+	.get_ts_info		= dsa_slave_get_ts_info,
 };
 
 /* legacy way, bypassing the bridge *****************************************/
-- 
cgit v1.2.3


From 90af1059c52c0031f3bfd8279c9ede153ca83275 Mon Sep 17 00:00:00 2001
From: Brandon Streiff <brandon.streiff@ni.com>
Date: Wed, 14 Feb 2018 01:07:49 +0100
Subject: net: dsa: forward timestamping callbacks to switch drivers

Forward the rx/tx timestamp machinery from the dsa infrastructure to the
switch driver.

On the rx side, defer delivery of skbs until we have an rx timestamp.
This mimicks the behavior of skb_defer_rx_timestamp.

On the tx side, identify PTP packets, clone them, and pass them to the
underlying switch driver before we transmit. This mimicks the behavior
of skb_tx_timestamp.

Adjusted txstamp API to keep the allocation and freeing of the clone
in the same central function by Richard Cochran

Signed-off-by: Brandon Streiff <brandon.streiff@ni.com>
Signed-off-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/dsa.c   | 36 ++++++++++++++++++++++++++++++++++++
 net/dsa/slave.c | 30 ++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

(limited to 'net')

diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 6a9d0f50fbee..e63c554e0623 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -23,6 +23,7 @@
 #include <linux/netdevice.h>
 #include <linux/sysfs.h>
 #include <linux/phy_fixed.h>
+#include <linux/ptp_classify.h>
 #include <linux/gpio/consumer.h>
 #include <linux/etherdevice.h>
 
@@ -122,6 +123,38 @@ struct net_device *dsa_dev_to_net_device(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dsa_dev_to_net_device);
 
+/* Determine if we should defer delivery of skb until we have a rx timestamp.
+ *
+ * Called from dsa_switch_rcv. For now, this will only work if tagging is
+ * enabled on the switch. Normally the MAC driver would retrieve the hardware
+ * timestamp when it reads the packet out of the hardware. However in a DSA
+ * switch, the DSA driver owning the interface to which the packet is
+ * delivered is never notified unless we do so here.
+ */
+static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p,
+				       struct sk_buff *skb)
+{
+	struct dsa_switch *ds = p->dp->ds;
+	unsigned int type;
+
+	if (skb_headroom(skb) < ETH_HLEN)
+		return false;
+
+	__skb_push(skb, ETH_HLEN);
+
+	type = ptp_classify_raw(skb);
+
+	__skb_pull(skb, ETH_HLEN);
+
+	if (type == PTP_CLASS_NONE)
+		return false;
+
+	if (likely(ds->ops->port_rxtstamp))
+		return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type);
+
+	return false;
+}
+
 static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 			  struct packet_type *pt, struct net_device *unused)
 {
@@ -157,6 +190,9 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
 	s->rx_bytes += skb->len;
 	u64_stats_update_end(&s->syncp);
 
+	if (dsa_skb_defer_rx_timestamp(p, skb))
+		return 0;
+
 	netif_receive_skb(skb);
 
 	return 0;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 935d93f0d36c..3376dad6dcfd 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -21,6 +21,7 @@
 #include <net/tc_act/tc_mirred.h>
 #include <linux/if_bridge.h>
 #include <linux/netpoll.h>
+#include <linux/ptp_classify.h>
 
 #include "dsa_priv.h"
 
@@ -401,6 +402,30 @@ static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev,
 	return NETDEV_TX_OK;
 }
 
+static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p,
+				 struct sk_buff *skb)
+{
+	struct dsa_switch *ds = p->dp->ds;
+	struct sk_buff *clone;
+	unsigned int type;
+
+	type = ptp_classify_raw(skb);
+	if (type == PTP_CLASS_NONE)
+		return;
+
+	if (!ds->ops->port_txtstamp)
+		return;
+
+	clone = skb_clone_sk(skb);
+	if (!clone)
+		return;
+
+	if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type))
+		return;
+
+	kfree_skb(clone);
+}
+
 static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct dsa_slave_priv *p = netdev_priv(dev);
@@ -413,6 +438,11 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
 	s->tx_bytes += skb->len;
 	u64_stats_update_end(&s->syncp);
 
+	/* Identify PTP protocol packets, clone them, and pass them to the
+	 * switch driver
+	 */
+	dsa_skb_tx_timestamp(p, skb);
+
 	/* Transmit function may have to reallocate the original SKB,
 	 * in which case it must have freed it. Only free it here on error.
 	 */
-- 
cgit v1.2.3


From 9942895b5ee4b0db53f32fbcb4a51360607aac1b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 13 Feb 2018 20:32:04 -0800
Subject: net: Move ipv4 set_lwt_redirect helper to lwtunnel

IPv4 uses set_lwt_redirect to set the lwtunnel redirect functions as
needed. Move it to lwtunnel.h as lwtunnel_set_redirect and change
IPv6 to also use it.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 17 ++---------------
 net/ipv6/route.c |  9 +--------
 2 files changed, 3 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b0ef4cc3e875..6ce623e3e2ab 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1645,19 +1645,6 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
 	spin_unlock_bh(&fnhe_lock);
 }
 
-static void set_lwt_redirect(struct rtable *rth)
-{
-	if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
-		rth->dst.lwtstate->orig_output = rth->dst.output;
-		rth->dst.output = lwtunnel_output;
-	}
-
-	if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
-		rth->dst.lwtstate->orig_input = rth->dst.input;
-		rth->dst.input = lwtunnel_input;
-	}
-}
-
 /* called in rcu_read_lock() section */
 static int __mkroute_input(struct sk_buff *skb,
 			   const struct fib_result *res,
@@ -1748,7 +1735,7 @@ rt_cache:
 
 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
 		       do_cache);
-	set_lwt_redirect(rth);
+	lwtunnel_set_redirect(&rth->dst);
 	skb_dst_set(skb, &rth->dst);
 out:
 	err = 0;
@@ -2267,7 +2254,7 @@ add:
 	}
 
 	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
-	set_lwt_redirect(rth);
+	lwtunnel_set_redirect(&rth->dst);
 
 	return rth;
 }
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9dcfadddd800..f0baae26db8f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2671,14 +2671,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 		if (err)
 			goto out;
 		rt->dst.lwtstate = lwtstate_get(lwtstate);
-		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
-			rt->dst.lwtstate->orig_output = rt->dst.output;
-			rt->dst.output = lwtunnel_output;
-		}
-		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
-			rt->dst.lwtstate->orig_input = rt->dst.input;
-			rt->dst.input = lwtunnel_input;
-		}
+		lwtunnel_set_redirect(&rt->dst);
 	}
 
 	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
-- 
cgit v1.2.3


From 37c64cf63ba1f9c071b37a2129ae9860fd423d6c Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Wed, 14 Feb 2018 13:34:39 +0100
Subject: tipc: apply bearer link tolerance on running links

Currently, the default link tolerance set in struct tipc_bearer only
has effect on links going up after that moment. I.e., a user has to
reset all the node's links across that bearer to have the new value
applied. This is too limiting and disturbing on a running cluster to
be useful.

We now change this so that also already existing links are updated
dynamically, without any need for a reset, when the bearer value is
changed. We leverage the already existing per-link functionality
for this to achieve the wanted effect.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bearer.c |  8 +++++---
 net/tipc/link.c   |  3 ++-
 net/tipc/node.c   | 24 ++++++++++++++++++++++++
 net/tipc/node.h   |  1 +
 4 files changed, 32 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index c8001471da6c..83d284feab1a 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -946,11 +946,11 @@ int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info)
 
 int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
 {
-	int err;
-	char *name;
 	struct tipc_bearer *b;
 	struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1];
 	struct net *net = sock_net(skb->sk);
+	char *name;
+	int err;
 
 	if (!info->attrs[TIPC_NLA_BEARER])
 		return -EINVAL;
@@ -982,8 +982,10 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info)
 			return err;
 		}
 
-		if (props[TIPC_NLA_PROP_TOL])
+		if (props[TIPC_NLA_PROP_TOL]) {
 			b->tolerance = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
+			tipc_node_apply_tolerance(net, b);
+		}
 		if (props[TIPC_NLA_PROP_PRIO])
 			b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
 		if (props[TIPC_NLA_PROP_WIN])
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 2d6b2aed30e0..3c230466804d 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -2126,7 +2126,8 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol,
 			     struct sk_buff_head *xmitq)
 {
 	l->tolerance = tol;
-	tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq);
+	if (link_is_up(l))
+		tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq);
 }
 
 void tipc_link_set_prio(struct tipc_link *l, u32 prio,
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 9036d8756e73..389193d7cf67 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1618,6 +1618,30 @@ discard:
 	kfree_skb(skb);
 }
 
+void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b)
+{
+	struct tipc_net *tn = tipc_net(net);
+	int bearer_id = b->identity;
+	struct sk_buff_head xmitq;
+	struct tipc_link_entry *e;
+	struct tipc_node *n;
+
+	__skb_queue_head_init(&xmitq);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(n, &tn->node_list, list) {
+		tipc_node_write_lock(n);
+		e = &n->links[bearer_id];
+		if (e->link)
+			tipc_link_set_tolerance(e->link, b->tolerance, &xmitq);
+		tipc_node_write_unlock(n);
+		tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr);
+	}
+
+	rcu_read_unlock();
+}
+
 int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
 {
 	struct net *net = sock_net(skb->sk);
diff --git a/net/tipc/node.h b/net/tipc/node.h
index acd58d23a70e..4ce5e3a185c0 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -65,6 +65,7 @@ void tipc_node_check_dest(struct net *net, u32 onode,
 			  struct tipc_media_addr *maddr,
 			  bool *respond, bool *dupl_addr);
 void tipc_node_delete_links(struct net *net, int bearer_id);
+void tipc_node_apply_tolerance(struct net *net, struct tipc_bearer *b);
 int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node,
 			   char *linkname, size_t len);
 int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode,
-- 
cgit v1.2.3


From dff8baa261174de689a44572d0ea182d7aa70598 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@quantonium.net>
Date: Wed, 14 Feb 2018 09:22:42 -0800
Subject: kcm: Call strp_stop before strp_done in kcm_attach

In kcm_attach strp_done is called when sk_user_data is already
set to fail the attach. strp_done needs the strp to be stopped and
warns if it isn't. Call strp_stop in this case to eliminate the
warning message.

Reported-by: syzbot+88dfb55e4c8b770d86e3@syzkaller.appspotmail.com
Fixes: e5571240236c5652f ("kcm: Check if sk_user_data already set in kcm_attach"
Signed-off-by: Tom Herbert <tom@quantonium.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/kcm/kcmsock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index f297d53a11aa..435594648dac 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1417,6 +1417,7 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
 	 */
 	if (csk->sk_user_data) {
 		write_unlock_bh(&csk->sk_callback_lock);
+		strp_stop(&psock->strp);
 		strp_done(&psock->strp);
 		kmem_cache_free(kcm_psockp, psock);
 		return -EALREADY;
-- 
cgit v1.2.3


From d8d211a2a0c37755a8660dc69f97b7c70bf210b1 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 14 Feb 2018 16:39:56 +0300
Subject: net: Make extern and export get_net_ns()

This function will be used to obtain net of tun device.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index d83e804d5e65..ab58e57c09ca 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -990,10 +990,11 @@ static long sock_do_ioctl(struct net *net, struct socket *sock,
  *	what to do with it - that's up to the protocol still.
  */
 
-static struct ns_common *get_net_ns(struct ns_common *ns)
+struct ns_common *get_net_ns(struct ns_common *ns)
 {
 	return &get_net(container_of(ns, struct net, ns))->ns;
 }
+EXPORT_SYMBOL_GPL(get_net_ns);
 
 static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
-- 
cgit v1.2.3


From 68e813aa43071377b698c662bc0214f2a833bcbb Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Wed, 14 Feb 2018 14:24:28 -0800
Subject: net/ipv4: Remove fib table id from rtable

Remove rt_table_id from rtable. It was added for getroute to return the
table id that was hit in the lookup. With the changes for fibmatch the
table id can be extracted from the fib_info returned in the fib_result
so it no longer needs to be in rtable directly.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c        | 9 +--------
 net/ipv4/xfrm4_policy.c | 1 -
 2 files changed, 1 insertion(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6ce623e3e2ab..5ca7415cd48c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1509,7 +1509,6 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
 		rt->rt_pmtu = 0;
 		rt->rt_gateway = 0;
 		rt->rt_uses_gateway = 0;
-		rt->rt_table_id = 0;
 		INIT_LIST_HEAD(&rt->rt_uncached);
 
 		rt->dst.output = ip_output;
@@ -1727,8 +1726,6 @@ rt_cache:
 	}
 
 	rth->rt_is_input = 1;
-	if (res->table)
-		rth->rt_table_id = res->table->tb_id;
 	RT_CACHE_STAT_INC(in_slow_tot);
 
 	rth->dst.input = ip_forward;
@@ -2001,8 +1998,6 @@ local_input:
 	rth->dst.tclassid = itag;
 #endif
 	rth->rt_is_input = 1;
-	if (res->table)
-		rth->rt_table_id = res->table->tb_id;
 
 	RT_CACHE_STAT_INC(in_slow_tot);
 	if (res->type == RTN_UNREACHABLE) {
@@ -2231,8 +2226,6 @@ add:
 		return ERR_PTR(-ENOBUFS);
 
 	rth->rt_iif = orig_oif;
-	if (res->table)
-		rth->rt_table_id = res->table->tb_id;
 
 	RT_CACHE_STAT_INC(out_slow_tot);
 
@@ -2762,7 +2755,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		rt->rt_flags |= RTCF_NOTIFY;
 
 	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
-		table_id = rt->rt_table_id;
+		table_id = res.table ? res.table->tb_id : 0;
 
 	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
 		if (!res.fi) {
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 753f526cf9db..796ac4115485 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -100,7 +100,6 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 	xdst->u.rt.rt_gateway = rt->rt_gateway;
 	xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
 	xdst->u.rt.rt_pmtu = rt->rt_pmtu;
-	xdst->u.rt.rt_table_id = rt->rt_table_id;
 	INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
 
 	return 0;
-- 
cgit v1.2.3


From 27469b7352b5197cffa0e3dadb5f1127f055da27 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Feb 2018 10:40:42 +0100
Subject: tipc: remove redundant code in topology server

The socket handling in the topology server is unnecessarily generic.
It is prepared to handle both SOCK_RDM, SOCK_DGRAM and SOCK_STREAM
type sockets, as well as the only socket type which is really used,
SOCK_SEQPACKET.

We now remove this redundant code to make the code more readable.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/server.c | 36 +++++++-----------------------------
 net/tipc/server.h |  4 +---
 net/tipc/subscr.c |  4 +---
 3 files changed, 9 insertions(+), 35 deletions(-)

(limited to 'net')

diff --git a/net/tipc/server.c b/net/tipc/server.c
index df0c563c90cd..04a6dd99dd65 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -82,7 +82,6 @@ struct tipc_conn {
 struct outqueue_entry {
 	struct list_head list;
 	struct kvec iov;
-	struct sockaddr_tipc dest;
 };
 
 static void tipc_recv_work(struct work_struct *work);
@@ -93,7 +92,6 @@ static void tipc_conn_kref_release(struct kref *kref)
 {
 	struct tipc_conn *con = container_of(kref, struct tipc_conn, kref);
 	struct tipc_server *s = con->server;
-	struct sockaddr_tipc *saddr = s->saddr;
 	struct socket *sock = con->sock;
 	struct sock *sk;
 
@@ -103,8 +101,6 @@ static void tipc_conn_kref_release(struct kref *kref)
 			__module_get(sock->ops->owner);
 			__module_get(sk->sk_prot_creator->owner);
 		}
-		saddr->scope = -TIPC_NODE_SCOPE;
-		kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr));
 		sock_release(sock);
 		con->sock = NULL;
 	}
@@ -325,36 +321,24 @@ static struct socket *tipc_create_listen_sock(struct tipc_conn *con)
 {
 	struct tipc_server *s = con->server;
 	struct socket *sock = NULL;
+	int imp = TIPC_CRITICAL_IMPORTANCE;
 	int ret;
 
 	ret = sock_create_kern(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock);
 	if (ret < 0)
 		return NULL;
 	ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE,
-				(char *)&s->imp, sizeof(s->imp));
+				(char *)&imp, sizeof(imp));
 	if (ret < 0)
 		goto create_err;
 	ret = kernel_bind(sock, (struct sockaddr *)s->saddr, sizeof(*s->saddr));
 	if (ret < 0)
 		goto create_err;
 
-	switch (s->type) {
-	case SOCK_STREAM:
-	case SOCK_SEQPACKET:
-		con->rx_action = tipc_accept_from_sock;
-
-		ret = kernel_listen(sock, 0);
-		if (ret < 0)
-			goto create_err;
-		break;
-	case SOCK_DGRAM:
-	case SOCK_RDM:
-		con->rx_action = tipc_receive_from_sock;
-		break;
-	default:
-		pr_err("Unknown socket type %d\n", s->type);
+	con->rx_action = tipc_accept_from_sock;
+	ret = kernel_listen(sock, 0);
+	if (ret < 0)
 		goto create_err;
-	}
 
 	/* As server's listening socket owner and creator is the same module,
 	 * we have to decrease TIPC module reference count to guarantee that
@@ -444,7 +428,7 @@ static void tipc_clean_outqueues(struct tipc_conn *con)
 }
 
 int tipc_conn_sendmsg(struct tipc_server *s, int conid,
-		      struct sockaddr_tipc *addr, void *data, size_t len)
+		      void *data, size_t len)
 {
 	struct outqueue_entry *e;
 	struct tipc_conn *con;
@@ -464,9 +448,6 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid,
 		return -ENOMEM;
 	}
 
-	if (addr)
-		memcpy(&e->dest, addr, sizeof(struct sockaddr_tipc));
-
 	spin_lock_bh(&con->outqueue_lock);
 	list_add_tail(&e->list, &con->outqueue);
 	spin_unlock_bh(&con->outqueue_lock);
@@ -575,10 +556,6 @@ static void tipc_send_to_sock(struct tipc_conn *con)
 		if (con->sock) {
 			memset(&msg, 0, sizeof(msg));
 			msg.msg_flags = MSG_DONTWAIT;
-			if (s->type == SOCK_DGRAM || s->type == SOCK_RDM) {
-				msg.msg_name = &e->dest;
-				msg.msg_namelen = sizeof(struct sockaddr_tipc);
-			}
 			ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1,
 					     e->iov.iov_len);
 			if (ret == -EWOULDBLOCK || ret == 0) {
@@ -591,6 +568,7 @@ static void tipc_send_to_sock(struct tipc_conn *con)
 			evt = e->iov.iov_base;
 			tipc_send_kern_top_evt(s->net, evt);
 		}
+
 		/* Don't starve users filling buffers */
 		if (++count >= MAX_SEND_MSG_COUNT) {
 			cond_resched();
diff --git a/net/tipc/server.h b/net/tipc/server.h
index 64df7513cd70..434736d545c2 100644
--- a/net/tipc/server.h
+++ b/net/tipc/server.h
@@ -79,12 +79,10 @@ struct tipc_server {
 				 void *buf, size_t len);
 	struct sockaddr_tipc *saddr;
 	char name[TIPC_SERVER_NAME_LEN];
-	int imp;
-	int type;
 };
 
 int tipc_conn_sendmsg(struct tipc_server *s, int conid,
-		      struct sockaddr_tipc *addr, void *data, size_t len);
+		      void *data, size_t len);
 
 bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
 			     u32 upper, u32 filter, int *conid);
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 68e26470c516..eaef826fc06d 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -81,7 +81,7 @@ static void tipc_subscrp_send_event(struct tipc_subscription *sub,
 	sub->evt.found_upper = htohl(found_upper, sub->swap);
 	sub->evt.port.ref = htohl(port_ref, sub->swap);
 	sub->evt.port.node = htohl(node, sub->swap);
-	tipc_conn_sendmsg(tn->topsrv, subscriber->conid, NULL,
+	tipc_conn_sendmsg(tn->topsrv, subscriber->conid,
 			  msg_sect.iov_base, msg_sect.iov_len);
 }
 
@@ -375,8 +375,6 @@ int tipc_topsrv_start(struct net *net)
 	}
 	topsrv->net			= net;
 	topsrv->saddr			= saddr;
-	topsrv->imp			= TIPC_CRITICAL_IMPORTANCE;
-	topsrv->type			= SOCK_SEQPACKET;
 	topsrv->max_rcvbuf_size		= sizeof(struct tipc_subscr);
 	topsrv->tipc_conn_recvmsg	= tipc_subscrb_rcv_cb;
 	topsrv->tipc_conn_new		= tipc_subscrb_connect_cb;
-- 
cgit v1.2.3


From c901d26d4a8137f3ad0e5865d331f7c63c42d9f9 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Feb 2018 10:40:43 +0100
Subject: tipc: remove unnecessary function pointers

Interaction between the functionality in server.c and subscr.c is
done via function pointers installed in struct server. This makes
the code harder to follow, and doesn't serve any obvious purpose.

Here, we replace the function pointers with direct function calls.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/server.c | 21 ++++++++++-----------
 net/tipc/server.h |  5 -----
 net/tipc/subscr.c | 27 ++++++---------------------
 net/tipc/subscr.h |  4 ++++
 4 files changed, 20 insertions(+), 37 deletions(-)

(limited to 'net')

diff --git a/net/tipc/server.c b/net/tipc/server.c
index 04a6dd99dd65..8aa2a33b1e48 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -33,6 +33,7 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include "subscr.h"
 #include "server.h"
 #include "core.h"
 #include "socket.h"
@@ -182,7 +183,6 @@ static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con)
 
 static void tipc_close_conn(struct tipc_conn *con)
 {
-	struct tipc_server *s = con->server;
 	struct sock *sk = con->sock->sk;
 	bool disconnect = false;
 
@@ -191,7 +191,7 @@ static void tipc_close_conn(struct tipc_conn *con)
 	if (disconnect) {
 		sk->sk_user_data = NULL;
 		if (con->conid)
-			s->tipc_conn_release(con->conid, con->usr_data);
+			tipc_subscrb_delete(con->usr_data);
 	}
 	write_unlock_bh(&sk->sk_callback_lock);
 
@@ -240,7 +240,6 @@ static int tipc_receive_from_sock(struct tipc_conn *con)
 {
 	struct tipc_server *s = con->server;
 	struct sock *sk = con->sock->sk;
-	struct sockaddr_tipc addr;
 	struct msghdr msg = {};
 	struct kvec iov;
 	void *buf;
@@ -254,7 +253,7 @@ static int tipc_receive_from_sock(struct tipc_conn *con)
 
 	iov.iov_base = buf;
 	iov.iov_len = s->max_rcvbuf_size;
-	msg.msg_name = &addr;
+	msg.msg_name = NULL;
 	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len);
 	ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT);
 	if (ret <= 0) {
@@ -264,8 +263,8 @@ static int tipc_receive_from_sock(struct tipc_conn *con)
 
 	read_lock_bh(&sk->sk_callback_lock);
 	if (test_bit(CF_CONNECTED, &con->flags))
-		ret = s->tipc_conn_recvmsg(sock_net(con->sock->sk), con->conid,
-					   &addr, con->usr_data, buf, ret);
+		ret = tipc_subscrb_rcv(sock_net(con->sock->sk), con->conid,
+				       con->usr_data, buf, ret);
 	read_unlock_bh(&sk->sk_callback_lock);
 	kmem_cache_free(s->rcvbuf_cache, buf);
 	if (ret < 0)
@@ -284,7 +283,6 @@ out_close:
 
 static int tipc_accept_from_sock(struct tipc_conn *con)
 {
-	struct tipc_server *s = con->server;
 	struct socket *sock = con->sock;
 	struct socket *newsock;
 	struct tipc_conn *newcon;
@@ -305,7 +303,8 @@ static int tipc_accept_from_sock(struct tipc_conn *con)
 	tipc_register_callbacks(newsock, newcon);
 
 	/* Notify that new connection is incoming */
-	newcon->usr_data = s->tipc_conn_new(newcon->conid);
+	newcon->usr_data = tipc_subscrb_create(newcon->conid);
+
 	if (!newcon->usr_data) {
 		sock_release(newsock);
 		conn_put(newcon);
@@ -489,7 +488,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
 
 	*conid = con->conid;
 	s = con->server;
-	scbr = s->tipc_conn_new(*conid);
+	scbr = tipc_subscrb_create(*conid);
 	if (!scbr) {
 		conn_put(con);
 		return false;
@@ -497,7 +496,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
 
 	con->usr_data = scbr;
 	con->sock = NULL;
-	s->tipc_conn_recvmsg(net, *conid, NULL, scbr, &sub, sizeof(sub));
+	tipc_subscrb_rcv(net, *conid, scbr, &sub, sizeof(sub));
 	return true;
 }
 
@@ -513,7 +512,7 @@ void tipc_topsrv_kern_unsubscr(struct net *net, int conid)
 	test_and_clear_bit(CF_CONNECTED, &con->flags);
 	srv = con->server;
 	if (con->conid)
-		srv->tipc_conn_release(con->conid, con->usr_data);
+		tipc_subscrb_delete(con->usr_data);
 	conn_put(con);
 	conn_put(con);
 }
diff --git a/net/tipc/server.h b/net/tipc/server.h
index 434736d545c2..b4b83bdc8b20 100644
--- a/net/tipc/server.h
+++ b/net/tipc/server.h
@@ -72,11 +72,6 @@ struct tipc_server {
 	struct workqueue_struct *rcv_wq;
 	struct workqueue_struct *send_wq;
 	int max_rcvbuf_size;
-	void *(*tipc_conn_new)(int conid);
-	void (*tipc_conn_release)(int conid, void *usr_data);
-	int (*tipc_conn_recvmsg)(struct net *net, int conid,
-				 struct sockaddr_tipc *addr, void *usr_data,
-				 void *buf, size_t len);
 	struct sockaddr_tipc *saddr;
 	char name[TIPC_SERVER_NAME_LEN];
 };
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index eaef826fc06d..b86fbbf7a0b9 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -220,7 +220,7 @@ static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber,
 	spin_unlock_bh(&subscriber->lock);
 }
 
-static struct tipc_subscriber *tipc_subscrb_create(int conid)
+struct tipc_subscriber *tipc_subscrb_create(int conid)
 {
 	struct tipc_subscriber *subscriber;
 
@@ -237,7 +237,7 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid)
 	return subscriber;
 }
 
-static void tipc_subscrb_delete(struct tipc_subscriber *subscriber)
+void tipc_subscrb_delete(struct tipc_subscriber *subscriber)
 {
 	tipc_subscrb_subscrp_delete(subscriber, NULL);
 	tipc_subscrb_put(subscriber);
@@ -315,16 +315,10 @@ static int tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
 	return 0;
 }
 
-/* Handle one termination request for the subscriber */
-static void tipc_subscrb_release_cb(int conid, void *usr_data)
-{
-	tipc_subscrb_delete((struct tipc_subscriber *)usr_data);
-}
-
-/* Handle one request to create a new subscription for the subscriber */
-static int tipc_subscrb_rcv_cb(struct net *net, int conid,
-			       struct sockaddr_tipc *addr, void *usr_data,
-			       void *buf, size_t len)
+/* Handle one request to create a new subscription for the subscriber
+ */
+int tipc_subscrb_rcv(struct net *net, int conid, void *usr_data,
+		     void *buf, size_t len)
 {
 	struct tipc_subscriber *subscriber = usr_data;
 	struct tipc_subscr *s = (struct tipc_subscr *)buf;
@@ -345,12 +339,6 @@ static int tipc_subscrb_rcv_cb(struct net *net, int conid,
 	return tipc_subscrp_subscribe(net, s, subscriber, swap, status);
 }
 
-/* Handle one request to establish a new subscriber */
-static void *tipc_subscrb_connect_cb(int conid)
-{
-	return (void *)tipc_subscrb_create(conid);
-}
-
 int tipc_topsrv_start(struct net *net)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
@@ -376,9 +364,6 @@ int tipc_topsrv_start(struct net *net)
 	topsrv->net			= net;
 	topsrv->saddr			= saddr;
 	topsrv->max_rcvbuf_size		= sizeof(struct tipc_subscr);
-	topsrv->tipc_conn_recvmsg	= tipc_subscrb_rcv_cb;
-	topsrv->tipc_conn_new		= tipc_subscrb_connect_cb;
-	topsrv->tipc_conn_release	= tipc_subscrb_release_cb;
 
 	strncpy(topsrv->name, name, strlen(name) + 1);
 	tn->topsrv = topsrv;
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index f3edca775d9f..a736f29ba9ab 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -67,6 +67,10 @@ struct tipc_subscription {
 	struct tipc_event evt;
 };
 
+struct tipc_subscriber *tipc_subscrb_create(int conid);
+void tipc_subscrb_delete(struct tipc_subscriber *subscriber);
+int tipc_subscrb_rcv(struct net *net, int conid, void *usr_data,
+		     void *buf, size_t len);
 int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
 			       u32 found_upper);
 void tipc_subscrp_report_overlap(struct tipc_subscription *sub,
-- 
cgit v1.2.3


From df79d040dcd7d7e580c50edf40b82e677fe84801 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Feb 2018 10:40:44 +0100
Subject: tipc: eliminate struct tipc_subscriber

It is unnecessary to keep two structures, struct tipc_conn and struct
tipc_subscriber, with a one-to-one relationship and still with different
life cycles. The fact that the two often run in different contexts, and
still may access each other via direct pointers constitutes an additional
hazard, something we have experienced at several occasions, and still
see happening.

We have identified at least two remaining problems that are easier to
fix if we simplify the topology server data structure somewhat.

- When there is a race between a subscription up/down event and a
  timeout event, it is fully possible that the former might be delivered
  after the latter, leading to confusion for the receiver.

- The function tipc_subcrp_timeout() is executing in interrupt context,
  while the following call chain is at least theoretically possible:
  tipc_subscrp_timeout()
    tipc_subscrp_send_event()
      tipc_conn_sendmsg()
        conn_put()
          tipc_conn_kref_release()
            sock_release(sock)

I.e., we end up calling a function that might try to sleep in
interrupt context. To eliminate this, we need to ensure that the
tipc_conn structure and the socket, as well as the subscription
instances, only are deleted in work queue context, i.e., after the
timeout event really has been sent out.

We now remove this unnecessary complexity, by merging data and
functionality of the subscriber structure into struct tipc_conn
and the associated file server.c. We thereafter add a spinlock and
a new 'inactive' state to the subscription structure. Using those,
both problems described above can be easily solved.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/server.c | 161 ++++++++++++++++++++++++++++++++------------------
 net/tipc/server.h |   2 +-
 net/tipc/subscr.c | 173 ++++++++++--------------------------------------------
 net/tipc/subscr.h |  17 +++---
 4 files changed, 146 insertions(+), 207 deletions(-)

(limited to 'net')

diff --git a/net/tipc/server.c b/net/tipc/server.c
index 8aa2a33b1e48..b8268c0882a7 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -2,6 +2,7 @@
  * net/tipc/server.c: TIPC server infrastructure
  *
  * Copyright (c) 2012-2013, Wind River Systems
+ * Copyright (c) 2017, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -57,12 +58,13 @@
  * @sock: socket handler associated with connection
  * @flags: indicates connection state
  * @server: pointer to connected server
+ * @sub_list: lsit to all pertaing subscriptions
+ * @sub_lock: lock protecting the subscription list
+ * @outqueue_lock: control access to the outqueue
  * @rwork: receive work item
- * @usr_data: user-specified field
  * @rx_action: what to do when connection socket is active
  * @outqueue: pointer to first outbound message in queue
  * @outqueue_lock: control access to the outqueue
- * @outqueue: list of connection objects for its server
  * @swork: send work item
  */
 struct tipc_conn {
@@ -71,9 +73,10 @@ struct tipc_conn {
 	struct socket *sock;
 	unsigned long flags;
 	struct tipc_server *server;
+	struct list_head sub_list;
+	spinlock_t sub_lock; /* for subscription list */
 	struct work_struct rwork;
 	int (*rx_action) (struct tipc_conn *con);
-	void *usr_data;
 	struct list_head outqueue;
 	spinlock_t outqueue_lock;
 	struct work_struct swork;
@@ -81,6 +84,7 @@ struct tipc_conn {
 
 /* An entry waiting to be sent */
 struct outqueue_entry {
+	u32 evt;
 	struct list_head list;
 	struct kvec iov;
 };
@@ -89,18 +93,33 @@ static void tipc_recv_work(struct work_struct *work);
 static void tipc_send_work(struct work_struct *work);
 static void tipc_clean_outqueues(struct tipc_conn *con);
 
+static bool connected(struct tipc_conn *con)
+{
+	return con && test_bit(CF_CONNECTED, &con->flags);
+}
+
+/**
+ * htohl - convert value to endianness used by destination
+ * @in: value to convert
+ * @swap: non-zero if endianness must be reversed
+ *
+ * Returns converted value
+ */
+static u32 htohl(u32 in, int swap)
+{
+	return swap ? swab32(in) : in;
+}
+
 static void tipc_conn_kref_release(struct kref *kref)
 {
 	struct tipc_conn *con = container_of(kref, struct tipc_conn, kref);
 	struct tipc_server *s = con->server;
 	struct socket *sock = con->sock;
-	struct sock *sk;
 
 	if (sock) {
-		sk = sock->sk;
 		if (test_bit(CF_SERVER, &con->flags)) {
 			__module_get(sock->ops->owner);
-			__module_get(sk->sk_prot_creator->owner);
+			__module_get(sock->sk->sk_prot_creator->owner);
 		}
 		sock_release(sock);
 		con->sock = NULL;
@@ -129,11 +148,8 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid)
 
 	spin_lock_bh(&s->idr_lock);
 	con = idr_find(&s->conn_idr, conid);
-	if (con) {
-		if (!test_bit(CF_CONNECTED, &con->flags) ||
-		    !kref_get_unless_zero(&con->kref))
-			con = NULL;
-	}
+	if (!connected(con) || !kref_get_unless_zero(&con->kref))
+		con = NULL;
 	spin_unlock_bh(&s->idr_lock);
 	return con;
 }
@@ -144,7 +160,7 @@ static void sock_data_ready(struct sock *sk)
 
 	read_lock_bh(&sk->sk_callback_lock);
 	con = sock2con(sk);
-	if (con && test_bit(CF_CONNECTED, &con->flags)) {
+	if (connected(con)) {
 		conn_get(con);
 		if (!queue_work(con->server->rcv_wq, &con->rwork))
 			conn_put(con);
@@ -158,7 +174,7 @@ static void sock_write_space(struct sock *sk)
 
 	read_lock_bh(&sk->sk_callback_lock);
 	con = sock2con(sk);
-	if (con && test_bit(CF_CONNECTED, &con->flags)) {
+	if (connected(con)) {
 		conn_get(con);
 		if (!queue_work(con->server->send_wq, &con->swork))
 			conn_put(con);
@@ -181,6 +197,24 @@ static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con)
 	write_unlock_bh(&sk->sk_callback_lock);
 }
 
+/* tipc_con_delete_sub - delete a specific or all subscriptions
+ *  for a given subscriber
+ */
+static void tipc_con_delete_sub(struct tipc_conn *con, struct tipc_subscr *s)
+{
+	struct list_head *sub_list = &con->sub_list;
+	struct tipc_subscription *sub, *tmp;
+
+	spin_lock_bh(&con->sub_lock);
+	list_for_each_entry_safe(sub, tmp, sub_list, subscrp_list) {
+		if (!s || !memcmp(s, &sub->evt.s, sizeof(*s)))
+			tipc_sub_delete(sub);
+		else if (s)
+			break;
+	}
+	spin_unlock_bh(&con->sub_lock);
+}
+
 static void tipc_close_conn(struct tipc_conn *con)
 {
 	struct sock *sk = con->sock->sk;
@@ -188,10 +222,11 @@ static void tipc_close_conn(struct tipc_conn *con)
 
 	write_lock_bh(&sk->sk_callback_lock);
 	disconnect = test_and_clear_bit(CF_CONNECTED, &con->flags);
+
 	if (disconnect) {
 		sk->sk_user_data = NULL;
 		if (con->conid)
-			tipc_subscrb_delete(con->usr_data);
+			tipc_con_delete_sub(con, NULL);
 	}
 	write_unlock_bh(&sk->sk_callback_lock);
 
@@ -215,7 +250,9 @@ static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s)
 
 	kref_init(&con->kref);
 	INIT_LIST_HEAD(&con->outqueue);
+	INIT_LIST_HEAD(&con->sub_list);
 	spin_lock_init(&con->outqueue_lock);
+	spin_lock_init(&con->sub_lock);
 	INIT_WORK(&con->swork, tipc_send_work);
 	INIT_WORK(&con->rwork, tipc_recv_work);
 
@@ -236,6 +273,35 @@ static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s)
 	return con;
 }
 
+int tipc_con_rcv_sub(struct net *net, int conid, struct tipc_conn *con,
+		     void *buf, size_t len)
+{
+	struct tipc_subscr *s = (struct tipc_subscr *)buf;
+	struct tipc_subscription *sub;
+	bool status;
+	int swap;
+
+	/* Determine subscriber's endianness */
+	swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE |
+			      TIPC_SUB_CANCEL));
+
+	/* Detect & process a subscription cancellation request */
+	if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
+		s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
+		tipc_con_delete_sub(con, s);
+		return 0;
+	}
+	status = !(s->filter & htohl(TIPC_SUB_NO_STATUS, swap));
+	sub = tipc_subscrp_subscribe(net, s, conid, swap, status);
+	if (!sub)
+		return -1;
+
+	spin_lock_bh(&con->sub_lock);
+	list_add(&sub->subscrp_list, &con->sub_list);
+	spin_unlock_bh(&con->sub_lock);
+	return 0;
+}
+
 static int tipc_receive_from_sock(struct tipc_conn *con)
 {
 	struct tipc_server *s = con->server;
@@ -262,9 +328,7 @@ static int tipc_receive_from_sock(struct tipc_conn *con)
 	}
 
 	read_lock_bh(&sk->sk_callback_lock);
-	if (test_bit(CF_CONNECTED, &con->flags))
-		ret = tipc_subscrb_rcv(sock_net(con->sock->sk), con->conid,
-				       con->usr_data, buf, ret);
+	ret = tipc_con_rcv_sub(s->net, con->conid, con, buf, ret);
 	read_unlock_bh(&sk->sk_callback_lock);
 	kmem_cache_free(s->rcvbuf_cache, buf);
 	if (ret < 0)
@@ -302,15 +366,6 @@ static int tipc_accept_from_sock(struct tipc_conn *con)
 	newcon->rx_action = tipc_receive_from_sock;
 	tipc_register_callbacks(newsock, newcon);
 
-	/* Notify that new connection is incoming */
-	newcon->usr_data = tipc_subscrb_create(newcon->conid);
-
-	if (!newcon->usr_data) {
-		sock_release(newsock);
-		conn_put(newcon);
-		return -ENOMEM;
-	}
-
 	/* Wake up receive process in case of 'SYN+' message */
 	newsock->sk->sk_data_ready(newsock->sk);
 	return ret;
@@ -427,7 +482,7 @@ static void tipc_clean_outqueues(struct tipc_conn *con)
 }
 
 int tipc_conn_sendmsg(struct tipc_server *s, int conid,
-		      void *data, size_t len)
+		      u32 evt, void *data, size_t len)
 {
 	struct outqueue_entry *e;
 	struct tipc_conn *con;
@@ -436,7 +491,7 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid,
 	if (!con)
 		return -EINVAL;
 
-	if (!test_bit(CF_CONNECTED, &con->flags)) {
+	if (!connected(con)) {
 		conn_put(con);
 		return 0;
 	}
@@ -446,7 +501,7 @@ int tipc_conn_sendmsg(struct tipc_server *s, int conid,
 		conn_put(con);
 		return -ENOMEM;
 	}
-
+	e->evt = evt;
 	spin_lock_bh(&con->outqueue_lock);
 	list_add_tail(&e->list, &con->outqueue);
 	spin_unlock_bh(&con->outqueue_lock);
@@ -470,10 +525,9 @@ void tipc_conn_terminate(struct tipc_server *s, int conid)
 bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
 			     u32 upper, u32 filter, int *conid)
 {
-	struct tipc_subscriber *scbr;
 	struct tipc_subscr sub;
-	struct tipc_server *s;
 	struct tipc_conn *con;
+	int rc;
 
 	sub.seq.type = type;
 	sub.seq.lower = lower;
@@ -487,32 +541,23 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
 		return false;
 
 	*conid = con->conid;
-	s = con->server;
-	scbr = tipc_subscrb_create(*conid);
-	if (!scbr) {
-		conn_put(con);
-		return false;
-	}
-
-	con->usr_data = scbr;
 	con->sock = NULL;
-	tipc_subscrb_rcv(net, *conid, scbr, &sub, sizeof(sub));
-	return true;
+	rc = tipc_con_rcv_sub(net, *conid, con, &sub, sizeof(sub));
+	if (rc < 0)
+		tipc_close_conn(con);
+	return !rc;
 }
 
 void tipc_topsrv_kern_unsubscr(struct net *net, int conid)
 {
 	struct tipc_conn *con;
-	struct tipc_server *srv;
 
 	con = tipc_conn_lookup(tipc_topsrv(net), conid);
 	if (!con)
 		return;
 
 	test_and_clear_bit(CF_CONNECTED, &con->flags);
-	srv = con->server;
-	if (con->conid)
-		tipc_subscrb_delete(con->usr_data);
+	tipc_con_delete_sub(con, NULL);
 	conn_put(con);
 	conn_put(con);
 }
@@ -537,7 +582,8 @@ static void tipc_send_kern_top_evt(struct net *net, struct tipc_event *evt)
 
 static void tipc_send_to_sock(struct tipc_conn *con)
 {
-	struct tipc_server *s = con->server;
+	struct list_head *queue = &con->outqueue;
+	struct tipc_server *srv = con->server;
 	struct outqueue_entry *e;
 	struct tipc_event *evt;
 	struct msghdr msg;
@@ -545,16 +591,20 @@ static void tipc_send_to_sock(struct tipc_conn *con)
 	int ret;
 
 	spin_lock_bh(&con->outqueue_lock);
-	while (test_bit(CF_CONNECTED, &con->flags)) {
-		e = list_entry(con->outqueue.next, struct outqueue_entry, list);
-		if ((struct list_head *) e == &con->outqueue)
-			break;
+
+	while (!list_empty(queue)) {
+		e = list_first_entry(queue, struct outqueue_entry, list);
 
 		spin_unlock_bh(&con->outqueue_lock);
 
+		if (e->evt == TIPC_SUBSCR_TIMEOUT) {
+			evt = (struct tipc_event *)e->iov.iov_base;
+			tipc_con_delete_sub(con, &evt->s);
+		}
+		memset(&msg, 0, sizeof(msg));
+		msg.msg_flags = MSG_DONTWAIT;
+
 		if (con->sock) {
-			memset(&msg, 0, sizeof(msg));
-			msg.msg_flags = MSG_DONTWAIT;
 			ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1,
 					     e->iov.iov_len);
 			if (ret == -EWOULDBLOCK || ret == 0) {
@@ -565,7 +615,7 @@ static void tipc_send_to_sock(struct tipc_conn *con)
 			}
 		} else {
 			evt = e->iov.iov_base;
-			tipc_send_kern_top_evt(s->net, evt);
+			tipc_send_kern_top_evt(srv->net, evt);
 		}
 
 		/* Don't starve users filling buffers */
@@ -573,7 +623,6 @@ static void tipc_send_to_sock(struct tipc_conn *con)
 			cond_resched();
 			count = 0;
 		}
-
 		spin_lock_bh(&con->outqueue_lock);
 		list_del(&e->list);
 		tipc_free_entry(e);
@@ -591,7 +640,7 @@ static void tipc_recv_work(struct work_struct *work)
 	struct tipc_conn *con = container_of(work, struct tipc_conn, rwork);
 	int count = 0;
 
-	while (test_bit(CF_CONNECTED, &con->flags)) {
+	while (connected(con)) {
 		if (con->rx_action(con))
 			break;
 
@@ -608,7 +657,7 @@ static void tipc_send_work(struct work_struct *work)
 {
 	struct tipc_conn *con = container_of(work, struct tipc_conn, swork);
 
-	if (test_bit(CF_CONNECTED, &con->flags))
+	if (connected(con))
 		tipc_send_to_sock(con);
 
 	conn_put(con);
diff --git a/net/tipc/server.h b/net/tipc/server.h
index b4b83bdc8b20..fcc72321362d 100644
--- a/net/tipc/server.h
+++ b/net/tipc/server.h
@@ -77,7 +77,7 @@ struct tipc_server {
 };
 
 int tipc_conn_sendmsg(struct tipc_server *s, int conid,
-		      void *data, size_t len);
+		      u32 evt, void *data, size_t len);
 
 bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
 			     u32 upper, u32 filter, int *conid);
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index b86fbbf7a0b9..c6de1452db69 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -1,7 +1,7 @@
 /*
  * net/tipc/subscr.c: TIPC network topology service
  *
- * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2000-2017, Ericsson AB
  * Copyright (c) 2005-2007, 2010-2013, Wind River Systems
  * All rights reserved.
  *
@@ -38,22 +38,6 @@
 #include "name_table.h"
 #include "subscr.h"
 
-/**
- * struct tipc_subscriber - TIPC network topology subscriber
- * @kref: reference counter to tipc_subscription object
- * @conid: connection identifier to server connecting to subscriber
- * @lock: control access to subscriber
- * @subscrp_list: list of subscription objects for this subscriber
- */
-struct tipc_subscriber {
-	struct kref kref;
-	int conid;
-	spinlock_t lock;
-	struct list_head subscrp_list;
-};
-
-static void tipc_subscrb_put(struct tipc_subscriber *subscriber);
-
 /**
  * htohl - convert value to endianness used by destination
  * @in: value to convert
@@ -71,9 +55,10 @@ static void tipc_subscrp_send_event(struct tipc_subscription *sub,
 				    u32 event, u32 port_ref, u32 node)
 {
 	struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
-	struct tipc_subscriber *subscriber = sub->subscriber;
 	struct kvec msg_sect;
 
+	if (sub->inactive)
+		return;
 	msg_sect.iov_base = (void *)&sub->evt;
 	msg_sect.iov_len = sizeof(struct tipc_event);
 	sub->evt.event = htohl(event, sub->swap);
@@ -81,7 +66,7 @@ static void tipc_subscrp_send_event(struct tipc_subscription *sub,
 	sub->evt.found_upper = htohl(found_upper, sub->swap);
 	sub->evt.port.ref = htohl(port_ref, sub->swap);
 	sub->evt.port.node = htohl(node, sub->swap);
-	tipc_conn_sendmsg(tn->topsrv, subscriber->conid,
+	tipc_conn_sendmsg(tn->topsrv, sub->conid, event,
 			  msg_sect.iov_base, msg_sect.iov_len);
 }
 
@@ -132,41 +117,22 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
 		return;
 	if (filter & TIPC_SUB_NODE_SCOPE && scope != TIPC_NODE_SCOPE)
 		return;
-
-	tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref,
-				node);
+	spin_lock(&sub->lock);
+	tipc_subscrp_send_event(sub, found_lower, found_upper,
+				event, port_ref, node);
+	spin_unlock(&sub->lock);
 }
 
 static void tipc_subscrp_timeout(struct timer_list *t)
 {
 	struct tipc_subscription *sub = from_timer(sub, t, timer);
-	struct tipc_subscriber *subscriber = sub->subscriber;
-
-	spin_lock_bh(&subscriber->lock);
-	tipc_nametbl_unsubscribe(sub);
-	list_del(&sub->subscrp_list);
-	spin_unlock_bh(&subscriber->lock);
+	struct tipc_subscr *s = &sub->evt.s;
 
-	/* Notify subscriber of timeout */
-	tipc_subscrp_send_event(sub, sub->evt.s.seq.lower, sub->evt.s.seq.upper,
+	spin_lock(&sub->lock);
+	tipc_subscrp_send_event(sub, s->seq.lower, s->seq.upper,
 				TIPC_SUBSCR_TIMEOUT, 0, 0);
-
-	tipc_subscrp_put(sub);
-}
-
-static void tipc_subscrb_kref_release(struct kref *kref)
-{
-	kfree(container_of(kref,struct tipc_subscriber, kref));
-}
-
-static void tipc_subscrb_put(struct tipc_subscriber *subscriber)
-{
-	kref_put(&subscriber->kref, tipc_subscrb_kref_release);
-}
-
-static void tipc_subscrb_get(struct tipc_subscriber *subscriber)
-{
-	kref_get(&subscriber->kref);
+	sub->inactive = true;
+	spin_unlock(&sub->lock);
 }
 
 static void tipc_subscrp_kref_release(struct kref *kref)
@@ -175,11 +141,9 @@ static void tipc_subscrp_kref_release(struct kref *kref)
 						     struct tipc_subscription,
 						     kref);
 	struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
-	struct tipc_subscriber *subscriber = sub->subscriber;
 
 	atomic_dec(&tn->subscription_count);
 	kfree(sub);
-	tipc_subscrb_put(subscriber);
 }
 
 void tipc_subscrp_put(struct tipc_subscription *subscription)
@@ -192,68 +156,9 @@ void tipc_subscrp_get(struct tipc_subscription *subscription)
 	kref_get(&subscription->kref);
 }
 
-/* tipc_subscrb_subscrp_delete - delete a specific subscription or all
- * subscriptions for a given subscriber.
- */
-static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber,
-					struct tipc_subscr *s)
-{
-	struct list_head *subscription_list = &subscriber->subscrp_list;
-	struct tipc_subscription *sub, *temp;
-	u32 timeout;
-
-	spin_lock_bh(&subscriber->lock);
-	list_for_each_entry_safe(sub, temp, subscription_list,  subscrp_list) {
-		if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr)))
-			continue;
-
-		timeout = htohl(sub->evt.s.timeout, sub->swap);
-		if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer)) {
-			tipc_nametbl_unsubscribe(sub);
-			list_del(&sub->subscrp_list);
-			tipc_subscrp_put(sub);
-		}
-
-		if (s)
-			break;
-	}
-	spin_unlock_bh(&subscriber->lock);
-}
-
-struct tipc_subscriber *tipc_subscrb_create(int conid)
-{
-	struct tipc_subscriber *subscriber;
-
-	subscriber = kzalloc(sizeof(*subscriber), GFP_ATOMIC);
-	if (!subscriber) {
-		pr_warn("Subscriber rejected, no memory\n");
-		return NULL;
-	}
-	INIT_LIST_HEAD(&subscriber->subscrp_list);
-	kref_init(&subscriber->kref);
-	subscriber->conid = conid;
-	spin_lock_init(&subscriber->lock);
-
-	return subscriber;
-}
-
-void tipc_subscrb_delete(struct tipc_subscriber *subscriber)
-{
-	tipc_subscrb_subscrp_delete(subscriber, NULL);
-	tipc_subscrb_put(subscriber);
-}
-
-static void tipc_subscrp_cancel(struct tipc_subscr *s,
-				struct tipc_subscriber *subscriber)
-{
-	tipc_subscrb_get(subscriber);
-	tipc_subscrb_subscrp_delete(subscriber, s);
-	tipc_subscrb_put(subscriber);
-}
-
 static struct tipc_subscription *tipc_subscrp_create(struct net *net,
 						     struct tipc_subscr *s,
-						     int swap)
+						     int conid, bool swap)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	struct tipc_subscription *sub;
@@ -275,6 +180,8 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net,
 
 	/* Initialize subscription object */
 	sub->net = net;
+	sub->conid = conid;
+	sub->inactive = false;
 	if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) ||
 	    (htohl(s->seq.lower, swap) > htohl(s->seq.upper, swap))) {
 		pr_warn("Subscription rejected, illegal request\n");
@@ -284,59 +191,39 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net,
 
 	sub->swap = swap;
 	memcpy(&sub->evt.s, s, sizeof(*s));
+	spin_lock_init(&sub->lock);
 	atomic_inc(&tn->subscription_count);
 	kref_init(&sub->kref);
 	return sub;
 }
 
-static int tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
-				  struct tipc_subscriber *subscriber, int swap,
-				  bool status)
+struct tipc_subscription *tipc_subscrp_subscribe(struct net *net,
+						 struct tipc_subscr *s,
+						 int conid, bool swap,
+						 bool status)
 {
 	struct tipc_subscription *sub = NULL;
 	u32 timeout;
 
-	sub = tipc_subscrp_create(net, s, swap);
+	sub = tipc_subscrp_create(net, s, conid, swap);
 	if (!sub)
-		return -1;
+		return NULL;
 
-	spin_lock_bh(&subscriber->lock);
-	list_add(&sub->subscrp_list, &subscriber->subscrp_list);
-	sub->subscriber = subscriber;
 	tipc_nametbl_subscribe(sub, status);
-	tipc_subscrb_get(subscriber);
-	spin_unlock_bh(&subscriber->lock);
-
 	timer_setup(&sub->timer, tipc_subscrp_timeout, 0);
 	timeout = htohl(sub->evt.s.timeout, swap);
-
 	if (timeout != TIPC_WAIT_FOREVER)
 		mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout));
-	return 0;
+	return sub;
 }
 
-/* Handle one request to create a new subscription for the subscriber
- */
-int tipc_subscrb_rcv(struct net *net, int conid, void *usr_data,
-		     void *buf, size_t len)
+void tipc_sub_delete(struct tipc_subscription *sub)
 {
-	struct tipc_subscriber *subscriber = usr_data;
-	struct tipc_subscr *s = (struct tipc_subscr *)buf;
-	bool status;
-	int swap;
-
-	/* Determine subscriber's endianness */
-	swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE |
-			      TIPC_SUB_CANCEL));
-
-	/* Detect & process a subscription cancellation request */
-	if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
-		s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
-		tipc_subscrp_cancel(s, subscriber);
-		return 0;
-	}
-	status = !(s->filter & htohl(TIPC_SUB_NO_STATUS, swap));
-	return tipc_subscrp_subscribe(net, s, subscriber, swap, status);
+	tipc_nametbl_unsubscribe(sub);
+	if (sub->evt.s.timeout != TIPC_WAIT_FOREVER)
+		del_timer_sync(&sub->timer);
+	list_del(&sub->subscrp_list);
+	tipc_subscrp_put(sub);
 }
 
 int tipc_topsrv_start(struct net *net)
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index a736f29ba9ab..cfd0cb3a1da8 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -43,7 +43,7 @@
 #define TIPC_MAX_PUBLICATIONS	65535
 
 struct tipc_subscription;
-struct tipc_subscriber;
+struct tipc_conn;
 
 /**
  * struct tipc_subscription - TIPC network topology subscription object
@@ -58,19 +58,22 @@ struct tipc_subscriber;
  */
 struct tipc_subscription {
 	struct kref kref;
-	struct tipc_subscriber *subscriber;
 	struct net *net;
 	struct timer_list timer;
 	struct list_head nameseq_list;
 	struct list_head subscrp_list;
-	int swap;
 	struct tipc_event evt;
+	int conid;
+	bool swap;
+	bool inactive;
+	spinlock_t lock; /* serialize up/down and timer events */
 };
 
-struct tipc_subscriber *tipc_subscrb_create(int conid);
-void tipc_subscrb_delete(struct tipc_subscriber *subscriber);
-int tipc_subscrb_rcv(struct net *net, int conid, void *usr_data,
-		     void *buf, size_t len);
+struct tipc_subscription *tipc_subscrp_subscribe(struct net *net,
+						 struct tipc_subscr *s,
+						 int conid, bool swap,
+						 bool status);
+void tipc_sub_delete(struct tipc_subscription *sub);
 int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
 			       u32 found_upper);
 void tipc_subscrp_report_overlap(struct tipc_subscription *sub,
-- 
cgit v1.2.3


From 414574a0af36d329f560f542e650cc4a81cc1d69 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Feb 2018 10:40:45 +0100
Subject: tipc: simplify interaction between subscription and topology
 connection

The message transmission and reception in the topology server is more
generic than is currently necessary. By basing the funtionality on the
fact that we only send items of type struct tipc_event and always
receive items of struct tipc_subcr we can make several simplifications,
and also get rid of some unnecessary dynamic memory allocations.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_table.c |  10 +--
 net/tipc/server.c     | 170 +++++++++++++++++---------------------------------
 net/tipc/server.h     |  12 +---
 net/tipc/subscr.c     |  40 ++++++------
 net/tipc/subscr.h     |   5 +-
 5 files changed, 88 insertions(+), 149 deletions(-)

(limited to 'net')

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index ed0457cc99d6..c0ca7be46f7a 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -810,14 +810,15 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref,
  */
 void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status)
 {
-	struct tipc_net *tn = net_generic(s->net, tipc_net_id);
+	struct tipc_server *srv = s->server;
+	struct tipc_net *tn = tipc_net(srv->net);
 	u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap);
 	int index = hash(type);
 	struct name_seq *seq;
 	struct tipc_name_seq ns;
 
 	spin_lock_bh(&tn->nametbl_lock);
-	seq = nametbl_find_seq(s->net, type);
+	seq = nametbl_find_seq(srv->net, type);
 	if (!seq)
 		seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]);
 	if (seq) {
@@ -837,12 +838,13 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status)
  */
 void tipc_nametbl_unsubscribe(struct tipc_subscription *s)
 {
-	struct tipc_net *tn = net_generic(s->net, tipc_net_id);
+	struct tipc_server *srv = s->server;
+	struct tipc_net *tn = tipc_net(srv->net);
 	struct name_seq *seq;
 	u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap);
 
 	spin_lock_bh(&tn->nametbl_lock);
-	seq = nametbl_find_seq(s->net, type);
+	seq = nametbl_find_seq(srv->net, type);
 	if (seq != NULL) {
 		spin_lock_bh(&seq->lock);
 		list_del_init(&s->nameseq_list);
diff --git a/net/tipc/server.c b/net/tipc/server.c
index b8268c0882a7..7933fb92d852 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -84,9 +84,9 @@ struct tipc_conn {
 
 /* An entry waiting to be sent */
 struct outqueue_entry {
-	u32 evt;
+	bool inactive;
+	struct tipc_event evt;
 	struct list_head list;
-	struct kvec iov;
 };
 
 static void tipc_recv_work(struct work_struct *work);
@@ -154,6 +154,9 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid)
 	return con;
 }
 
+/* sock_data_ready - interrupt callback indicating the socket has data to read
+ * The queued job is launched in tipc_recv_from_sock()
+ */
 static void sock_data_ready(struct sock *sk)
 {
 	struct tipc_conn *con;
@@ -168,6 +171,10 @@ static void sock_data_ready(struct sock *sk)
 	read_unlock_bh(&sk->sk_callback_lock);
 }
 
+/* sock_write_space - interrupt callback after a sendmsg EAGAIN
+ * Indicates that there now is more is space in the send buffer
+ * The queued job is launched in tipc_send_to_sock()
+ */
 static void sock_write_space(struct sock *sk)
 {
 	struct tipc_conn *con;
@@ -273,10 +280,10 @@ static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s)
 	return con;
 }
 
-int tipc_con_rcv_sub(struct net *net, int conid, struct tipc_conn *con,
-		     void *buf, size_t len)
+static int tipc_con_rcv_sub(struct tipc_server *srv,
+			    struct tipc_conn *con,
+			    struct tipc_subscr *s)
 {
-	struct tipc_subscr *s = (struct tipc_subscr *)buf;
 	struct tipc_subscription *sub;
 	bool status;
 	int swap;
@@ -292,7 +299,7 @@ int tipc_con_rcv_sub(struct net *net, int conid, struct tipc_conn *con,
 		return 0;
 	}
 	status = !(s->filter & htohl(TIPC_SUB_NO_STATUS, swap));
-	sub = tipc_subscrp_subscribe(net, s, conid, swap, status);
+	sub = tipc_subscrp_subscribe(srv, s, con->conid, swap, status);
 	if (!sub)
 		return -1;
 
@@ -304,43 +311,27 @@ int tipc_con_rcv_sub(struct net *net, int conid, struct tipc_conn *con,
 
 static int tipc_receive_from_sock(struct tipc_conn *con)
 {
-	struct tipc_server *s = con->server;
+	struct tipc_server *srv = con->server;
 	struct sock *sk = con->sock->sk;
 	struct msghdr msg = {};
+	struct tipc_subscr s;
 	struct kvec iov;
-	void *buf;
 	int ret;
 
-	buf = kmem_cache_alloc(s->rcvbuf_cache, GFP_ATOMIC);
-	if (!buf) {
-		ret = -ENOMEM;
-		goto out_close;
-	}
-
-	iov.iov_base = buf;
-	iov.iov_len = s->max_rcvbuf_size;
+	iov.iov_base = &s;
+	iov.iov_len = sizeof(s);
 	msg.msg_name = NULL;
 	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len);
 	ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT);
-	if (ret <= 0) {
-		kmem_cache_free(s->rcvbuf_cache, buf);
-		goto out_close;
+	if (ret == -EWOULDBLOCK)
+		return -EWOULDBLOCK;
+	if (ret > 0) {
+		read_lock_bh(&sk->sk_callback_lock);
+		ret = tipc_con_rcv_sub(srv, con, &s);
+		read_unlock_bh(&sk->sk_callback_lock);
 	}
-
-	read_lock_bh(&sk->sk_callback_lock);
-	ret = tipc_con_rcv_sub(s->net, con->conid, con, buf, ret);
-	read_unlock_bh(&sk->sk_callback_lock);
-	kmem_cache_free(s->rcvbuf_cache, buf);
 	if (ret < 0)
-		tipc_conn_terminate(s, con->conid);
-	return ret;
-
-out_close:
-	if (ret != -EWOULDBLOCK)
 		tipc_close_conn(con);
-	else if (ret == 0)
-		/* Don't return success if we really got EOF */
-		ret = -EAGAIN;
 
 	return ret;
 }
@@ -442,33 +433,6 @@ static int tipc_open_listening_sock(struct tipc_server *s)
 	return 0;
 }
 
-static struct outqueue_entry *tipc_alloc_entry(void *data, int len)
-{
-	struct outqueue_entry *entry;
-	void *buf;
-
-	entry = kmalloc(sizeof(struct outqueue_entry), GFP_ATOMIC);
-	if (!entry)
-		return NULL;
-
-	buf = kmemdup(data, len, GFP_ATOMIC);
-	if (!buf) {
-		kfree(entry);
-		return NULL;
-	}
-
-	entry->iov.iov_base = buf;
-	entry->iov.iov_len = len;
-
-	return entry;
-}
-
-static void tipc_free_entry(struct outqueue_entry *e)
-{
-	kfree(e->iov.iov_base);
-	kfree(e);
-}
-
 static void tipc_clean_outqueues(struct tipc_conn *con)
 {
 	struct outqueue_entry *e, *safe;
@@ -476,50 +440,40 @@ static void tipc_clean_outqueues(struct tipc_conn *con)
 	spin_lock_bh(&con->outqueue_lock);
 	list_for_each_entry_safe(e, safe, &con->outqueue, list) {
 		list_del(&e->list);
-		tipc_free_entry(e);
+		kfree(e);
 	}
 	spin_unlock_bh(&con->outqueue_lock);
 }
 
-int tipc_conn_sendmsg(struct tipc_server *s, int conid,
-		      u32 evt, void *data, size_t len)
+/* tipc_conn_queue_evt - interrupt level call from a subscription instance
+ * The queued job is launched in tipc_send_to_sock()
+ */
+void tipc_conn_queue_evt(struct tipc_server *s, int conid,
+			 u32 event, struct tipc_event *evt)
 {
 	struct outqueue_entry *e;
 	struct tipc_conn *con;
 
 	con = tipc_conn_lookup(s, conid);
 	if (!con)
-		return -EINVAL;
+		return;
 
-	if (!connected(con)) {
-		conn_put(con);
-		return 0;
-	}
+	if (!connected(con))
+		goto err;
 
-	e = tipc_alloc_entry(data, len);
-	if (!e) {
-		conn_put(con);
-		return -ENOMEM;
-	}
-	e->evt = evt;
+	e = kmalloc(sizeof(*e), GFP_ATOMIC);
+	if (!e)
+		goto err;
+	e->inactive = (event == TIPC_SUBSCR_TIMEOUT);
+	memcpy(&e->evt, evt, sizeof(*evt));
 	spin_lock_bh(&con->outqueue_lock);
 	list_add_tail(&e->list, &con->outqueue);
 	spin_unlock_bh(&con->outqueue_lock);
 
-	if (!queue_work(s->send_wq, &con->swork))
-		conn_put(con);
-	return 0;
-}
-
-void tipc_conn_terminate(struct tipc_server *s, int conid)
-{
-	struct tipc_conn *con;
-
-	con = tipc_conn_lookup(s, conid);
-	if (con) {
-		tipc_close_conn(con);
-		conn_put(con);
-	}
+	if (queue_work(s->send_wq, &con->swork))
+		return;
+err:
+	conn_put(con);
 }
 
 bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
@@ -542,7 +496,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
 
 	*conid = con->conid;
 	con->sock = NULL;
-	rc = tipc_con_rcv_sub(net, *conid, con, &sub, sizeof(sub));
+	rc = tipc_con_rcv_sub(tipc_topsrv(net), con, &sub);
 	if (rc < 0)
 		tipc_close_conn(con);
 	return !rc;
@@ -587,6 +541,7 @@ static void tipc_send_to_sock(struct tipc_conn *con)
 	struct outqueue_entry *e;
 	struct tipc_event *evt;
 	struct msghdr msg;
+	struct kvec iov;
 	int count = 0;
 	int ret;
 
@@ -594,27 +549,28 @@ static void tipc_send_to_sock(struct tipc_conn *con)
 
 	while (!list_empty(queue)) {
 		e = list_first_entry(queue, struct outqueue_entry, list);
-
+		evt = &e->evt;
 		spin_unlock_bh(&con->outqueue_lock);
 
-		if (e->evt == TIPC_SUBSCR_TIMEOUT) {
-			evt = (struct tipc_event *)e->iov.iov_base;
+		if (e->inactive)
 			tipc_con_delete_sub(con, &evt->s);
-		}
+
 		memset(&msg, 0, sizeof(msg));
 		msg.msg_flags = MSG_DONTWAIT;
+		iov.iov_base = evt;
+		iov.iov_len = sizeof(*evt);
+		msg.msg_name = NULL;
 
 		if (con->sock) {
-			ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1,
-					     e->iov.iov_len);
+			ret = kernel_sendmsg(con->sock, &msg, &iov,
+					     1, sizeof(*evt));
 			if (ret == -EWOULDBLOCK || ret == 0) {
 				cond_resched();
 				goto out;
 			} else if (ret < 0) {
-				goto send_err;
+				goto err;
 			}
 		} else {
-			evt = e->iov.iov_base;
 			tipc_send_kern_top_evt(srv->net, evt);
 		}
 
@@ -625,13 +581,12 @@ static void tipc_send_to_sock(struct tipc_conn *con)
 		}
 		spin_lock_bh(&con->outqueue_lock);
 		list_del(&e->list);
-		tipc_free_entry(e);
+		kfree(e);
 	}
 	spin_unlock_bh(&con->outqueue_lock);
 out:
 	return;
-
-send_err:
+err:
 	tipc_close_conn(con);
 }
 
@@ -695,22 +650,14 @@ int tipc_server_start(struct tipc_server *s)
 	idr_init(&s->conn_idr);
 	s->idr_in_use = 0;
 
-	s->rcvbuf_cache = kmem_cache_create(s->name, s->max_rcvbuf_size,
-					    0, SLAB_HWCACHE_ALIGN, NULL);
-	if (!s->rcvbuf_cache)
-		return -ENOMEM;
-
 	ret = tipc_work_start(s);
-	if (ret < 0) {
-		kmem_cache_destroy(s->rcvbuf_cache);
+	if (ret < 0)
 		return ret;
-	}
+
 	ret = tipc_open_listening_sock(s);
-	if (ret < 0) {
+	if (ret < 0)
 		tipc_work_stop(s);
-		kmem_cache_destroy(s->rcvbuf_cache);
-		return ret;
-	}
+
 	return ret;
 }
 
@@ -731,6 +678,5 @@ void tipc_server_stop(struct tipc_server *s)
 	spin_unlock_bh(&s->idr_lock);
 
 	tipc_work_stop(s);
-	kmem_cache_destroy(s->rcvbuf_cache);
 	idr_destroy(&s->conn_idr);
 }
diff --git a/net/tipc/server.h b/net/tipc/server.h
index fcc72321362d..2de8709b467d 100644
--- a/net/tipc/server.h
+++ b/net/tipc/server.h
@@ -36,6 +36,7 @@
 #ifndef _TIPC_SERVER_H
 #define _TIPC_SERVER_H
 
+#include "core.h"
 #include <linux/idr.h>
 #include <linux/tipc.h>
 #include <net/net_namespace.h>
@@ -68,7 +69,6 @@ struct tipc_server {
 	spinlock_t idr_lock;
 	int idr_in_use;
 	struct net *net;
-	struct kmem_cache *rcvbuf_cache;
 	struct workqueue_struct *rcv_wq;
 	struct workqueue_struct *send_wq;
 	int max_rcvbuf_size;
@@ -76,19 +76,13 @@ struct tipc_server {
 	char name[TIPC_SERVER_NAME_LEN];
 };
 
-int tipc_conn_sendmsg(struct tipc_server *s, int conid,
-		      u32 evt, void *data, size_t len);
+void tipc_conn_queue_evt(struct tipc_server *s, int conid,
+			 u32 event, struct tipc_event *evt);
 
 bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
 			     u32 upper, u32 filter, int *conid);
 void tipc_topsrv_kern_unsubscr(struct net *net, int conid);
 
-/**
- * tipc_conn_terminate - terminate connection with server
- *
- * Note: Must call it in process context since it might sleep
- */
-void tipc_conn_terminate(struct tipc_server *s, int conid);
 int tipc_server_start(struct tipc_server *s);
 
 void tipc_server_stop(struct tipc_server *s);
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index c6de1452db69..c3e0b924e8c2 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -52,22 +52,19 @@ static u32 htohl(u32 in, int swap)
 
 static void tipc_subscrp_send_event(struct tipc_subscription *sub,
 				    u32 found_lower, u32 found_upper,
-				    u32 event, u32 port_ref, u32 node)
+				    u32 event, u32 port, u32 node)
 {
-	struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
-	struct kvec msg_sect;
+	struct tipc_event *evt = &sub->evt;
+	bool swap = sub->swap;
 
 	if (sub->inactive)
 		return;
-	msg_sect.iov_base = (void *)&sub->evt;
-	msg_sect.iov_len = sizeof(struct tipc_event);
-	sub->evt.event = htohl(event, sub->swap);
-	sub->evt.found_lower = htohl(found_lower, sub->swap);
-	sub->evt.found_upper = htohl(found_upper, sub->swap);
-	sub->evt.port.ref = htohl(port_ref, sub->swap);
-	sub->evt.port.node = htohl(node, sub->swap);
-	tipc_conn_sendmsg(tn->topsrv, sub->conid, event,
-			  msg_sect.iov_base, msg_sect.iov_len);
+	evt->event = htohl(event, swap);
+	evt->found_lower = htohl(found_lower, swap);
+	evt->found_upper = htohl(found_upper, swap);
+	evt->port.ref = htohl(port, swap);
+	evt->port.node = htohl(node, swap);
+	tipc_conn_queue_evt(sub->server, sub->conid, event, evt);
 }
 
 /**
@@ -137,10 +134,11 @@ static void tipc_subscrp_timeout(struct timer_list *t)
 
 static void tipc_subscrp_kref_release(struct kref *kref)
 {
-	struct tipc_subscription *sub = container_of(kref,
-						     struct tipc_subscription,
-						     kref);
-	struct tipc_net *tn = net_generic(sub->net, tipc_net_id);
+	struct tipc_subscription *sub;
+	struct tipc_net *tn;
+
+	sub = container_of(kref, struct tipc_subscription, kref);
+	tn = tipc_net(sub->server->net);
 
 	atomic_dec(&tn->subscription_count);
 	kfree(sub);
@@ -156,11 +154,11 @@ void tipc_subscrp_get(struct tipc_subscription *subscription)
 	kref_get(&subscription->kref);
 }
 
-static struct tipc_subscription *tipc_subscrp_create(struct net *net,
+static struct tipc_subscription *tipc_subscrp_create(struct tipc_server *srv,
 						     struct tipc_subscr *s,
 						     int conid, bool swap)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct tipc_net *tn = tipc_net(srv->net);
 	struct tipc_subscription *sub;
 	u32 filter = htohl(s->filter, swap);
 
@@ -179,7 +177,7 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net,
 	}
 
 	/* Initialize subscription object */
-	sub->net = net;
+	sub->server = srv;
 	sub->conid = conid;
 	sub->inactive = false;
 	if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) ||
@@ -197,7 +195,7 @@ static struct tipc_subscription *tipc_subscrp_create(struct net *net,
 	return sub;
 }
 
-struct tipc_subscription *tipc_subscrp_subscribe(struct net *net,
+struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv,
 						 struct tipc_subscr *s,
 						 int conid, bool swap,
 						 bool status)
@@ -205,7 +203,7 @@ struct tipc_subscription *tipc_subscrp_subscribe(struct net *net,
 	struct tipc_subscription *sub = NULL;
 	u32 timeout;
 
-	sub = tipc_subscrp_create(net, s, conid, swap);
+	sub = tipc_subscrp_create(srv, s, conid, swap);
 	if (!sub)
 		return NULL;
 
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index cfd0cb3a1da8..64ffd32d15e6 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -49,7 +49,6 @@ struct tipc_conn;
  * struct tipc_subscription - TIPC network topology subscription object
  * @subscriber: pointer to its subscriber
  * @seq: name sequence associated with subscription
- * @net: point to network namespace
  * @timer: timer governing subscription duration (optional)
  * @nameseq_list: adjacent subscriptions in name sequence's subscription list
  * @subscrp_list: adjacent subscriptions in subscriber's subscription list
@@ -58,7 +57,7 @@ struct tipc_conn;
  */
 struct tipc_subscription {
 	struct kref kref;
-	struct net *net;
+	struct tipc_server *server;
 	struct timer_list timer;
 	struct list_head nameseq_list;
 	struct list_head subscrp_list;
@@ -69,7 +68,7 @@ struct tipc_subscription {
 	spinlock_t lock; /* serialize up/down and timer events */
 };
 
-struct tipc_subscription *tipc_subscrp_subscribe(struct net *net,
+struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv,
 						 struct tipc_subscr *s,
 						 int conid, bool swap,
 						 bool status);
-- 
cgit v1.2.3


From 8985ecc7c1e07c42acc1e44ac56fa224f8a5c62f Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Feb 2018 10:40:46 +0100
Subject: tipc: simplify endianness handling in topology subscriber

Because of the requirement for total distribution transparency, users
send subscriptions and receive topology events in their own host format.
It is up to the topology server to determine this format and do the
correct conversions to and from its own host format when needed.

Until now, this has been handled in a rather non-transparent way inside
the topology server and subscriber code, leading to unnecessary
complexity when creating subscriptions and issuing events.

We now improve this situation by adding two new macros, tipc_sub_read()
and tipc_evt_write(). Both those functions calculate the need for
conversion internally before performing their respective operations.
Hence, all handling of such conversions become transparent to the rest
of the code.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_table.c | 42 +++++++++++++++-----------
 net/tipc/name_table.h |  2 +-
 net/tipc/server.c     | 25 ++--------------
 net/tipc/subscr.c     | 83 +++++++++++++++++++--------------------------------
 net/tipc/subscr.h     | 36 ++++++++++++++++------
 5 files changed, 86 insertions(+), 102 deletions(-)

(limited to 'net')

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index c0ca7be46f7a..2fbd0a20a958 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -412,18 +412,22 @@ found:
  * sequence overlapping with the requested sequence
  */
 static void tipc_nameseq_subscribe(struct name_seq *nseq,
-				   struct tipc_subscription *s,
-				   bool status)
+				   struct tipc_subscription *sub)
 {
 	struct sub_seq *sseq = nseq->sseqs;
 	struct tipc_name_seq ns;
+	struct tipc_subscr *s = &sub->evt.s;
+	bool no_status;
 
-	tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns);
+	ns.type = tipc_sub_read(s, seq.type);
+	ns.lower = tipc_sub_read(s, seq.lower);
+	ns.upper = tipc_sub_read(s, seq.upper);
+	no_status = tipc_sub_read(s, filter) & TIPC_SUB_NO_STATUS;
 
-	tipc_subscrp_get(s);
-	list_add(&s->nameseq_list, &nseq->subscriptions);
+	tipc_subscrp_get(sub);
+	list_add(&sub->nameseq_list, &nseq->subscriptions);
 
-	if (!status || !sseq)
+	if (no_status || !sseq)
 		return;
 
 	while (sseq != &nseq->sseqs[nseq->first_free]) {
@@ -433,7 +437,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq,
 			int must_report = 1;
 
 			list_for_each_entry(crs, &info->zone_list, zone_list) {
-				tipc_subscrp_report_overlap(s, sseq->lower,
+				tipc_subscrp_report_overlap(sub, sseq->lower,
 							    sseq->upper,
 							    TIPC_PUBLISHED,
 							    crs->ref, crs->node,
@@ -808,11 +812,12 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref,
 /**
  * tipc_nametbl_subscribe - add a subscription object to the name table
  */
-void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status)
+void tipc_nametbl_subscribe(struct tipc_subscription *sub)
 {
-	struct tipc_server *srv = s->server;
+	struct tipc_server *srv = sub->server;
 	struct tipc_net *tn = tipc_net(srv->net);
-	u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap);
+	struct tipc_subscr *s = &sub->evt.s;
+	u32 type = tipc_sub_read(s, seq.type);
 	int index = hash(type);
 	struct name_seq *seq;
 	struct tipc_name_seq ns;
@@ -823,10 +828,12 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status)
 		seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]);
 	if (seq) {
 		spin_lock_bh(&seq->lock);
-		tipc_nameseq_subscribe(seq, s, status);
+		tipc_nameseq_subscribe(seq, sub);
 		spin_unlock_bh(&seq->lock);
 	} else {
-		tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns);
+		ns.type = tipc_sub_read(s, seq.type);
+		ns.lower = tipc_sub_read(s, seq.lower);
+		ns.upper = tipc_sub_read(s, seq.upper);
 		pr_warn("Failed to create subscription for {%u,%u,%u}\n",
 			ns.type, ns.lower, ns.upper);
 	}
@@ -836,19 +843,20 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status)
 /**
  * tipc_nametbl_unsubscribe - remove a subscription object from name table
  */
-void tipc_nametbl_unsubscribe(struct tipc_subscription *s)
+void tipc_nametbl_unsubscribe(struct tipc_subscription *sub)
 {
-	struct tipc_server *srv = s->server;
+	struct tipc_server *srv = sub->server;
+	struct tipc_subscr *s = &sub->evt.s;
 	struct tipc_net *tn = tipc_net(srv->net);
 	struct name_seq *seq;
-	u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap);
+	u32 type = tipc_sub_read(s, seq.type);
 
 	spin_lock_bh(&tn->nametbl_lock);
 	seq = nametbl_find_seq(srv->net, type);
 	if (seq != NULL) {
 		spin_lock_bh(&seq->lock);
-		list_del_init(&s->nameseq_list);
-		tipc_subscrp_put(s);
+		list_del_init(&sub->nameseq_list);
+		tipc_subscrp_put(sub);
 		if (!seq->first_free && list_empty(&seq->subscriptions)) {
 			hlist_del_init_rcu(&seq->ns_list);
 			kfree(seq->sseqs);
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index f56e7cb3d436..17652602d5e2 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -120,7 +120,7 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
 struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
 					     u32 lower, u32 node, u32 ref,
 					     u32 key);
-void tipc_nametbl_subscribe(struct tipc_subscription *s, bool status);
+void tipc_nametbl_subscribe(struct tipc_subscription *s);
 void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
 int tipc_nametbl_init(struct net *net);
 void tipc_nametbl_stop(struct net *net);
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 7933fb92d852..5d231fa8e4b5 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -98,18 +98,6 @@ static bool connected(struct tipc_conn *con)
 	return con && test_bit(CF_CONNECTED, &con->flags);
 }
 
-/**
- * htohl - convert value to endianness used by destination
- * @in: value to convert
- * @swap: non-zero if endianness must be reversed
- *
- * Returns converted value
- */
-static u32 htohl(u32 in, int swap)
-{
-	return swap ? swab32(in) : in;
-}
-
 static void tipc_conn_kref_release(struct kref *kref)
 {
 	struct tipc_conn *con = container_of(kref, struct tipc_conn, kref);
@@ -285,21 +273,12 @@ static int tipc_con_rcv_sub(struct tipc_server *srv,
 			    struct tipc_subscr *s)
 {
 	struct tipc_subscription *sub;
-	bool status;
-	int swap;
-
-	/* Determine subscriber's endianness */
-	swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE |
-			      TIPC_SUB_CANCEL));
 
-	/* Detect & process a subscription cancellation request */
-	if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
-		s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
+	if (tipc_sub_read(s, filter) & TIPC_SUB_CANCEL) {
 		tipc_con_delete_sub(con, s);
 		return 0;
 	}
-	status = !(s->filter & htohl(TIPC_SUB_NO_STATUS, swap));
-	sub = tipc_subscrp_subscribe(srv, s, con->conid, swap, status);
+	sub = tipc_subscrp_subscribe(srv, s, con->conid);
 	if (!sub)
 		return -1;
 
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index c3e0b924e8c2..406b09fc227b 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -38,32 +38,19 @@
 #include "name_table.h"
 #include "subscr.h"
 
-/**
- * htohl - convert value to endianness used by destination
- * @in: value to convert
- * @swap: non-zero if endianness must be reversed
- *
- * Returns converted value
- */
-static u32 htohl(u32 in, int swap)
-{
-	return swap ? swab32(in) : in;
-}
-
 static void tipc_subscrp_send_event(struct tipc_subscription *sub,
 				    u32 found_lower, u32 found_upper,
 				    u32 event, u32 port, u32 node)
 {
 	struct tipc_event *evt = &sub->evt;
-	bool swap = sub->swap;
 
 	if (sub->inactive)
 		return;
-	evt->event = htohl(event, swap);
-	evt->found_lower = htohl(found_lower, swap);
-	evt->found_upper = htohl(found_upper, swap);
-	evt->port.ref = htohl(port, swap);
-	evt->port.node = htohl(node, swap);
+	tipc_evt_write(evt, event, event);
+	tipc_evt_write(evt, found_lower, found_lower);
+	tipc_evt_write(evt, found_upper, found_upper);
+	tipc_evt_write(evt, port.ref, port);
+	tipc_evt_write(evt, port.node, node);
 	tipc_conn_queue_evt(sub->server, sub->conid, event, evt);
 }
 
@@ -85,29 +72,22 @@ int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
 	return 1;
 }
 
-u32 tipc_subscrp_convert_seq_type(u32 type, int swap)
+void tipc_subscrp_report_overlap(struct tipc_subscription *sub,
+				 u32 found_lower, u32 found_upper,
+				 u32 event, u32 port, u32 node,
+				 u32 scope, int must)
 {
-	return htohl(type, swap);
-}
-
-void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap,
-			      struct tipc_name_seq *out)
-{
-	out->type = htohl(in->type, swap);
-	out->lower = htohl(in->lower, swap);
-	out->upper = htohl(in->upper, swap);
-}
-
-void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
-				 u32 found_upper, u32 event, u32 port_ref,
-				 u32 node, u32 scope, int must)
-{
-	u32 filter = htohl(sub->evt.s.filter, sub->swap);
 	struct tipc_name_seq seq;
+	struct tipc_subscr *s = &sub->evt.s;
+	u32 filter = tipc_sub_read(s, filter);
+
+	seq.type = tipc_sub_read(s, seq.type);
+	seq.lower = tipc_sub_read(s, seq.lower);
+	seq.upper = tipc_sub_read(s, seq.upper);
 
-	tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq);
 	if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper))
 		return;
+
 	if (!must && !(filter & TIPC_SUB_PORTS))
 		return;
 	if (filter & TIPC_SUB_CLUSTER_SCOPE && scope == TIPC_NODE_SCOPE)
@@ -116,7 +96,7 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
 		return;
 	spin_lock(&sub->lock);
 	tipc_subscrp_send_event(sub, found_lower, found_upper,
-				event, port_ref, node);
+				event, port, node);
 	spin_unlock(&sub->lock);
 }
 
@@ -156,11 +136,11 @@ void tipc_subscrp_get(struct tipc_subscription *subscription)
 
 static struct tipc_subscription *tipc_subscrp_create(struct tipc_server *srv,
 						     struct tipc_subscr *s,
-						     int conid, bool swap)
+						     int conid)
 {
 	struct tipc_net *tn = tipc_net(srv->net);
 	struct tipc_subscription *sub;
-	u32 filter = htohl(s->filter, swap);
+	u32 filter = tipc_sub_read(s, filter);
 
 	/* Refuse subscription if global limit exceeded */
 	if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCRIPTIONS) {
@@ -177,39 +157,38 @@ static struct tipc_subscription *tipc_subscrp_create(struct tipc_server *srv,
 	}
 
 	/* Initialize subscription object */
+	if (filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE)
+		goto err;
+	if (tipc_sub_read(s, seq.lower) > tipc_sub_read(s, seq.upper))
+		goto err;
 	sub->server = srv;
 	sub->conid = conid;
 	sub->inactive = false;
-	if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) ||
-	    (htohl(s->seq.lower, swap) > htohl(s->seq.upper, swap))) {
-		pr_warn("Subscription rejected, illegal request\n");
-		kfree(sub);
-		return NULL;
-	}
-
-	sub->swap = swap;
 	memcpy(&sub->evt.s, s, sizeof(*s));
 	spin_lock_init(&sub->lock);
 	atomic_inc(&tn->subscription_count);
 	kref_init(&sub->kref);
 	return sub;
+err:
+	pr_warn("Subscription rejected, illegal request\n");
+	kfree(sub);
+	return NULL;
 }
 
 struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv,
 						 struct tipc_subscr *s,
-						 int conid, bool swap,
-						 bool status)
+						 int conid)
 {
 	struct tipc_subscription *sub = NULL;
 	u32 timeout;
 
-	sub = tipc_subscrp_create(srv, s, conid, swap);
+	sub = tipc_subscrp_create(srv, s, conid);
 	if (!sub)
 		return NULL;
 
-	tipc_nametbl_subscribe(sub, status);
+	tipc_nametbl_subscribe(sub);
 	timer_setup(&sub->timer, tipc_subscrp_timeout, 0);
-	timeout = htohl(sub->evt.s.timeout, swap);
+	timeout = tipc_sub_read(&sub->evt.s, timeout);
 	if (timeout != TIPC_WAIT_FOREVER)
 		mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout));
 	return sub;
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index 64ffd32d15e6..db80e41bba08 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -52,7 +52,6 @@ struct tipc_conn;
  * @timer: timer governing subscription duration (optional)
  * @nameseq_list: adjacent subscriptions in name sequence's subscription list
  * @subscrp_list: adjacent subscriptions in subscriber's subscription list
- * @swap: indicates if subscriber uses opposite endianness in its messages
  * @evt: template for events generated by subscription
  */
 struct tipc_subscription {
@@ -63,28 +62,47 @@ struct tipc_subscription {
 	struct list_head subscrp_list;
 	struct tipc_event evt;
 	int conid;
-	bool swap;
 	bool inactive;
 	spinlock_t lock; /* serialize up/down and timer events */
 };
 
 struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv,
 						 struct tipc_subscr *s,
-						 int conid, bool swap,
-						 bool status);
+						 int conid);
 void tipc_sub_delete(struct tipc_subscription *sub);
+
 int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
 			       u32 found_upper);
 void tipc_subscrp_report_overlap(struct tipc_subscription *sub,
-				 u32 found_lower, u32 found_upper, u32 event,
-				 u32 port_ref, u32 node, u32 scope, int must);
-void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap,
-			      struct tipc_name_seq *out);
-u32 tipc_subscrp_convert_seq_type(u32 type, int swap);
+				 u32 found_lower, u32 found_upper,
+				 u32 event, u32 port, u32 node,
+				 u32 scope, int must);
 int tipc_topsrv_start(struct net *net);
 void tipc_topsrv_stop(struct net *net);
 
 void tipc_subscrp_put(struct tipc_subscription *subscription);
 void tipc_subscrp_get(struct tipc_subscription *subscription);
 
+#define TIPC_FILTER_MASK (TIPC_SUB_PORTS | TIPC_SUB_SERVICE | TIPC_SUB_CANCEL)
+
+/* tipc_sub_read - return field_ of struct sub_ in host endian format
+ */
+#define tipc_sub_read(sub_, field_)					\
+	({								\
+		struct tipc_subscr *sub__ = sub_;			\
+		u32 val__ = (sub__)->field_;				\
+		int swap_ = !((sub__)->filter & TIPC_FILTER_MASK);	\
+		(swap_ ? swab32(val__) : val__);			\
+	})
+
+/* tipc_evt_write - write val_ to field_ of struct evt_ in user endian format
+ */
+#define tipc_evt_write(evt_, field_, val_)				\
+	({								\
+		struct tipc_event *evt__ = evt_;			\
+		u32 val__ = val_;					\
+		int swap_ = !((evt__)->s.filter & (TIPC_FILTER_MASK));	\
+		(evt__)->field_ = swap_ ? swab32(val__) : val__;	\
+	})
+
 #endif
-- 
cgit v1.2.3


From 242e82cc95f6b4e83e1771f9915edcb2a63708e1 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Feb 2018 10:40:47 +0100
Subject: tipc: collapse subscription creation functions

After the previous changes it becomes logical to collapse the two-level
creation of subscription instances into one. We do that here.

We also rename the creation and deletion functions for more consistency.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/server.c |  4 ++--
 net/tipc/server.h |  1 +
 net/tipc/subscr.c | 46 ++++++++++++----------------------------------
 net/tipc/subscr.h | 14 +++++++-------
 4 files changed, 22 insertions(+), 43 deletions(-)

(limited to 'net')

diff --git a/net/tipc/server.c b/net/tipc/server.c
index 5d231fa8e4b5..6a18b10ac271 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -203,7 +203,7 @@ static void tipc_con_delete_sub(struct tipc_conn *con, struct tipc_subscr *s)
 	spin_lock_bh(&con->sub_lock);
 	list_for_each_entry_safe(sub, tmp, sub_list, subscrp_list) {
 		if (!s || !memcmp(s, &sub->evt.s, sizeof(*s)))
-			tipc_sub_delete(sub);
+			tipc_sub_unsubscribe(sub);
 		else if (s)
 			break;
 	}
@@ -278,7 +278,7 @@ static int tipc_con_rcv_sub(struct tipc_server *srv,
 		tipc_con_delete_sub(con, s);
 		return 0;
 	}
-	sub = tipc_subscrp_subscribe(srv, s, con->conid);
+	sub = tipc_sub_subscribe(srv, s, con->conid);
 	if (!sub)
 		return -1;
 
diff --git a/net/tipc/server.h b/net/tipc/server.h
index 2de8709b467d..995b79591ffe 100644
--- a/net/tipc/server.h
+++ b/net/tipc/server.h
@@ -2,6 +2,7 @@
  * net/tipc/server.h: Include file for TIPC server code
  *
  * Copyright (c) 2012-2013, Wind River Systems
+ * Copyright (c) 2017, Ericsson AB
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 406b09fc227b..8d37b61e3163 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -134,33 +134,29 @@ void tipc_subscrp_get(struct tipc_subscription *subscription)
 	kref_get(&subscription->kref);
 }
 
-static struct tipc_subscription *tipc_subscrp_create(struct tipc_server *srv,
-						     struct tipc_subscr *s,
-						     int conid)
+struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv,
+					     struct tipc_subscr *s,
+					     int conid)
 {
 	struct tipc_net *tn = tipc_net(srv->net);
 	struct tipc_subscription *sub;
 	u32 filter = tipc_sub_read(s, filter);
+	u32 timeout;
 
-	/* Refuse subscription if global limit exceeded */
-	if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCRIPTIONS) {
-		pr_warn("Subscription rejected, limit reached (%u)\n",
-			TIPC_MAX_SUBSCRIPTIONS);
+	if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) {
+		pr_warn("Subscription rejected, max (%u)\n", TIPC_MAX_SUBSCR);
+		return NULL;
+	}
+	if ((filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE) ||
+	    (tipc_sub_read(s, seq.lower) > tipc_sub_read(s, seq.upper))) {
+		pr_warn("Subscription rejected, illegal request\n");
 		return NULL;
 	}
-
-	/* Allocate subscription object */
 	sub = kmalloc(sizeof(*sub), GFP_ATOMIC);
 	if (!sub) {
 		pr_warn("Subscription rejected, no memory\n");
 		return NULL;
 	}
-
-	/* Initialize subscription object */
-	if (filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE)
-		goto err;
-	if (tipc_sub_read(s, seq.lower) > tipc_sub_read(s, seq.upper))
-		goto err;
 	sub->server = srv;
 	sub->conid = conid;
 	sub->inactive = false;
@@ -168,24 +164,6 @@ static struct tipc_subscription *tipc_subscrp_create(struct tipc_server *srv,
 	spin_lock_init(&sub->lock);
 	atomic_inc(&tn->subscription_count);
 	kref_init(&sub->kref);
-	return sub;
-err:
-	pr_warn("Subscription rejected, illegal request\n");
-	kfree(sub);
-	return NULL;
-}
-
-struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv,
-						 struct tipc_subscr *s,
-						 int conid)
-{
-	struct tipc_subscription *sub = NULL;
-	u32 timeout;
-
-	sub = tipc_subscrp_create(srv, s, conid);
-	if (!sub)
-		return NULL;
-
 	tipc_nametbl_subscribe(sub);
 	timer_setup(&sub->timer, tipc_subscrp_timeout, 0);
 	timeout = tipc_sub_read(&sub->evt.s, timeout);
@@ -194,7 +172,7 @@ struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv,
 	return sub;
 }
 
-void tipc_sub_delete(struct tipc_subscription *sub)
+void tipc_sub_unsubscribe(struct tipc_subscription *sub)
 {
 	tipc_nametbl_unsubscribe(sub);
 	if (sub->evt.s.timeout != TIPC_WAIT_FOREVER)
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index db80e41bba08..2d35f1046754 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -1,7 +1,7 @@
 /*
  * net/tipc/subscr.h: Include file for TIPC network topology service
  *
- * Copyright (c) 2003-2006, Ericsson AB
+ * Copyright (c) 2003-2017, Ericsson AB
  * Copyright (c) 2005-2007, 2012-2013, Wind River Systems
  * All rights reserved.
  *
@@ -39,8 +39,8 @@
 
 #include "server.h"
 
-#define TIPC_MAX_SUBSCRIPTIONS	65535
-#define TIPC_MAX_PUBLICATIONS	65535
+#define TIPC_MAX_SUBSCR         65535
+#define TIPC_MAX_PUBLICATIONS   65535
 
 struct tipc_subscription;
 struct tipc_conn;
@@ -66,10 +66,10 @@ struct tipc_subscription {
 	spinlock_t lock; /* serialize up/down and timer events */
 };
 
-struct tipc_subscription *tipc_subscrp_subscribe(struct tipc_server *srv,
-						 struct tipc_subscr *s,
-						 int conid);
-void tipc_sub_delete(struct tipc_subscription *sub);
+struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv,
+					     struct tipc_subscr *s,
+					     int conid);
+void tipc_sub_unsubscribe(struct tipc_subscription *sub);
 
 int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
 			       u32 found_upper);
-- 
cgit v1.2.3


From da0a75e86ae230f92743c073843d3ea35bd061af Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Feb 2018 10:40:48 +0100
Subject: tipc: some prefix changes

Since we now have removed struct tipc_subscriber from the code, and
only struct tipc_subscription remains, there is no longer need for long
and awkward prefixes to distinguish between their pertaining functions.

We now change all tipc_subscrp_* prefixes to tipc_sub_*. This is
a purely cosmetic change.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_table.c | 33 ++++++++++++++++----------------
 net/tipc/server.c     |  5 ++---
 net/tipc/subscr.c     | 52 +++++++++++++++++++++++++--------------------------
 net/tipc/subscr.h     | 20 ++++++++++----------
 4 files changed, 54 insertions(+), 56 deletions(-)

(limited to 'net')

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 2fbd0a20a958..b234b7ee0b84 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -326,10 +326,10 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net,
 
 	/* Any subscriptions waiting for notification?  */
 	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
-		tipc_subscrp_report_overlap(s, publ->lower, publ->upper,
-					    TIPC_PUBLISHED, publ->ref,
-					    publ->node, publ->scope,
-					    created_subseq);
+		tipc_sub_report_overlap(s, publ->lower, publ->upper,
+					TIPC_PUBLISHED, publ->ref,
+					publ->node, publ->scope,
+					created_subseq);
 	}
 	return publ;
 }
@@ -397,10 +397,9 @@ found:
 
 	/* Notify any waiting subscriptions */
 	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
-		tipc_subscrp_report_overlap(s, publ->lower, publ->upper,
-					    TIPC_WITHDRAWN, publ->ref,
-					    publ->node, publ->scope,
-					    removed_subseq);
+		tipc_sub_report_overlap(s, publ->lower, publ->upper,
+					TIPC_WITHDRAWN, publ->ref, publ->node,
+					publ->scope, removed_subseq);
 	}
 
 	return publ;
@@ -424,25 +423,25 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq,
 	ns.upper = tipc_sub_read(s, seq.upper);
 	no_status = tipc_sub_read(s, filter) & TIPC_SUB_NO_STATUS;
 
-	tipc_subscrp_get(sub);
+	tipc_sub_get(sub);
 	list_add(&sub->nameseq_list, &nseq->subscriptions);
 
 	if (no_status || !sseq)
 		return;
 
 	while (sseq != &nseq->sseqs[nseq->first_free]) {
-		if (tipc_subscrp_check_overlap(&ns, sseq->lower, sseq->upper)) {
+		if (tipc_sub_check_overlap(&ns, sseq->lower, sseq->upper)) {
 			struct publication *crs;
 			struct name_info *info = sseq->info;
 			int must_report = 1;
 
 			list_for_each_entry(crs, &info->zone_list, zone_list) {
-				tipc_subscrp_report_overlap(sub, sseq->lower,
-							    sseq->upper,
-							    TIPC_PUBLISHED,
-							    crs->ref, crs->node,
-							    crs->scope,
-							    must_report);
+				tipc_sub_report_overlap(sub, sseq->lower,
+							sseq->upper,
+							TIPC_PUBLISHED,
+							crs->ref, crs->node,
+							crs->scope,
+							must_report);
 				must_report = 0;
 			}
 		}
@@ -856,7 +855,7 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *sub)
 	if (seq != NULL) {
 		spin_lock_bh(&seq->lock);
 		list_del_init(&sub->nameseq_list);
-		tipc_subscrp_put(sub);
+		tipc_sub_put(sub);
 		if (!seq->first_free && list_empty(&seq->subscriptions)) {
 			hlist_del_init_rcu(&seq->ns_list);
 			kfree(seq->sseqs);
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 6a18b10ac271..a5c112ed3bbe 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -201,7 +201,7 @@ static void tipc_con_delete_sub(struct tipc_conn *con, struct tipc_subscr *s)
 	struct tipc_subscription *sub, *tmp;
 
 	spin_lock_bh(&con->sub_lock);
-	list_for_each_entry_safe(sub, tmp, sub_list, subscrp_list) {
+	list_for_each_entry_safe(sub, tmp, sub_list, sub_list) {
 		if (!s || !memcmp(s, &sub->evt.s, sizeof(*s)))
 			tipc_sub_unsubscribe(sub);
 		else if (s)
@@ -281,9 +281,8 @@ static int tipc_con_rcv_sub(struct tipc_server *srv,
 	sub = tipc_sub_subscribe(srv, s, con->conid);
 	if (!sub)
 		return -1;
-
 	spin_lock_bh(&con->sub_lock);
-	list_add(&sub->subscrp_list, &con->sub_list);
+	list_add(&sub->sub_list, &con->sub_list);
 	spin_unlock_bh(&con->sub_lock);
 	return 0;
 }
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 8d37b61e3163..3be1e4b6cbfa 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -38,9 +38,9 @@
 #include "name_table.h"
 #include "subscr.h"
 
-static void tipc_subscrp_send_event(struct tipc_subscription *sub,
-				    u32 found_lower, u32 found_upper,
-				    u32 event, u32 port, u32 node)
+static void tipc_sub_send_event(struct tipc_subscription *sub,
+				u32 found_lower, u32 found_upper,
+				u32 event, u32 port, u32 node)
 {
 	struct tipc_event *evt = &sub->evt;
 
@@ -55,13 +55,13 @@ static void tipc_subscrp_send_event(struct tipc_subscription *sub,
 }
 
 /**
- * tipc_subscrp_check_overlap - test for subscription overlap with the
+ * tipc_sub_check_overlap - test for subscription overlap with the
  * given values
  *
  * Returns 1 if there is overlap, otherwise 0.
  */
-int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
-			       u32 found_upper)
+int tipc_sub_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
+			   u32 found_upper)
 {
 	if (found_lower < seq->lower)
 		found_lower = seq->lower;
@@ -72,20 +72,20 @@ int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
 	return 1;
 }
 
-void tipc_subscrp_report_overlap(struct tipc_subscription *sub,
-				 u32 found_lower, u32 found_upper,
-				 u32 event, u32 port, u32 node,
-				 u32 scope, int must)
+void tipc_sub_report_overlap(struct tipc_subscription *sub,
+			     u32 found_lower, u32 found_upper,
+			     u32 event, u32 port, u32 node,
+			     u32 scope, int must)
 {
-	struct tipc_name_seq seq;
 	struct tipc_subscr *s = &sub->evt.s;
 	u32 filter = tipc_sub_read(s, filter);
+	struct tipc_name_seq seq;
 
 	seq.type = tipc_sub_read(s, seq.type);
 	seq.lower = tipc_sub_read(s, seq.lower);
 	seq.upper = tipc_sub_read(s, seq.upper);
 
-	if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper))
+	if (!tipc_sub_check_overlap(&seq, found_lower, found_upper))
 		return;
 
 	if (!must && !(filter & TIPC_SUB_PORTS))
@@ -95,24 +95,24 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub,
 	if (filter & TIPC_SUB_NODE_SCOPE && scope != TIPC_NODE_SCOPE)
 		return;
 	spin_lock(&sub->lock);
-	tipc_subscrp_send_event(sub, found_lower, found_upper,
-				event, port, node);
+	tipc_sub_send_event(sub, found_lower, found_upper,
+			    event, port, node);
 	spin_unlock(&sub->lock);
 }
 
-static void tipc_subscrp_timeout(struct timer_list *t)
+static void tipc_sub_timeout(struct timer_list *t)
 {
 	struct tipc_subscription *sub = from_timer(sub, t, timer);
 	struct tipc_subscr *s = &sub->evt.s;
 
 	spin_lock(&sub->lock);
-	tipc_subscrp_send_event(sub, s->seq.lower, s->seq.upper,
-				TIPC_SUBSCR_TIMEOUT, 0, 0);
+	tipc_sub_send_event(sub, s->seq.lower, s->seq.upper,
+			    TIPC_SUBSCR_TIMEOUT, 0, 0);
 	sub->inactive = true;
 	spin_unlock(&sub->lock);
 }
 
-static void tipc_subscrp_kref_release(struct kref *kref)
+static void tipc_sub_kref_release(struct kref *kref)
 {
 	struct tipc_subscription *sub;
 	struct tipc_net *tn;
@@ -124,12 +124,12 @@ static void tipc_subscrp_kref_release(struct kref *kref)
 	kfree(sub);
 }
 
-void tipc_subscrp_put(struct tipc_subscription *subscription)
+void tipc_sub_put(struct tipc_subscription *subscription)
 {
-	kref_put(&subscription->kref, tipc_subscrp_kref_release);
+	kref_put(&subscription->kref, tipc_sub_kref_release);
 }
 
-void tipc_subscrp_get(struct tipc_subscription *subscription)
+void tipc_sub_get(struct tipc_subscription *subscription)
 {
 	kref_get(&subscription->kref);
 }
@@ -139,8 +139,8 @@ struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv,
 					     int conid)
 {
 	struct tipc_net *tn = tipc_net(srv->net);
-	struct tipc_subscription *sub;
 	u32 filter = tipc_sub_read(s, filter);
+	struct tipc_subscription *sub;
 	u32 timeout;
 
 	if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) {
@@ -165,7 +165,7 @@ struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv,
 	atomic_inc(&tn->subscription_count);
 	kref_init(&sub->kref);
 	tipc_nametbl_subscribe(sub);
-	timer_setup(&sub->timer, tipc_subscrp_timeout, 0);
+	timer_setup(&sub->timer, tipc_sub_timeout, 0);
 	timeout = tipc_sub_read(&sub->evt.s, timeout);
 	if (timeout != TIPC_WAIT_FOREVER)
 		mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout));
@@ -177,16 +177,16 @@ void tipc_sub_unsubscribe(struct tipc_subscription *sub)
 	tipc_nametbl_unsubscribe(sub);
 	if (sub->evt.s.timeout != TIPC_WAIT_FOREVER)
 		del_timer_sync(&sub->timer);
-	list_del(&sub->subscrp_list);
-	tipc_subscrp_put(sub);
+	list_del(&sub->sub_list);
+	tipc_sub_put(sub);
 }
 
 int tipc_topsrv_start(struct net *net)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	const char name[] = "topology_server";
-	struct tipc_server *topsrv;
 	struct sockaddr_tipc *saddr;
+	struct tipc_server *topsrv;
 
 	saddr = kzalloc(sizeof(*saddr), GFP_ATOMIC);
 	if (!saddr)
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index 2d35f1046754..720932896ba6 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -51,7 +51,7 @@ struct tipc_conn;
  * @seq: name sequence associated with subscription
  * @timer: timer governing subscription duration (optional)
  * @nameseq_list: adjacent subscriptions in name sequence's subscription list
- * @subscrp_list: adjacent subscriptions in subscriber's subscription list
+ * @sub_list: adjacent subscriptions in subscriber's subscription list
  * @evt: template for events generated by subscription
  */
 struct tipc_subscription {
@@ -59,7 +59,7 @@ struct tipc_subscription {
 	struct tipc_server *server;
 	struct timer_list timer;
 	struct list_head nameseq_list;
-	struct list_head subscrp_list;
+	struct list_head sub_list;
 	struct tipc_event evt;
 	int conid;
 	bool inactive;
@@ -71,17 +71,17 @@ struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv,
 					     int conid);
 void tipc_sub_unsubscribe(struct tipc_subscription *sub);
 
-int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
-			       u32 found_upper);
-void tipc_subscrp_report_overlap(struct tipc_subscription *sub,
-				 u32 found_lower, u32 found_upper,
-				 u32 event, u32 port, u32 node,
-				 u32 scope, int must);
+int tipc_sub_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
+			   u32 found_upper);
+void tipc_sub_report_overlap(struct tipc_subscription *sub,
+			     u32 found_lower, u32 found_upper,
+			     u32 event, u32 port, u32 node,
+			     u32 scope, int must);
 int tipc_topsrv_start(struct net *net);
 void tipc_topsrv_stop(struct net *net);
 
-void tipc_subscrp_put(struct tipc_subscription *subscription);
-void tipc_subscrp_get(struct tipc_subscription *subscription);
+void tipc_sub_put(struct tipc_subscription *subscription);
+void tipc_sub_get(struct tipc_subscription *subscription);
 
 #define TIPC_FILTER_MASK (TIPC_SUB_PORTS | TIPC_SUB_SERVICE | TIPC_SUB_CANCEL)
 
-- 
cgit v1.2.3


From 5c45ab24ac77ea32eae7d3576cf37c3ddb259f80 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Feb 2018 10:40:49 +0100
Subject: tipc: make struct tipc_server private for server.c

In order to narrow the interface and dependencies between the topology
server and the subscription/binding table functionality we move struct
tipc_server inside the file server.c. This requires some code
adaptations in other files, but those are mostly minor.

The most important change is that we have to move the start/stop
functions for the topology server to server.c, where they logically
belong anyway.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_table.c |  10 ++--
 net/tipc/server.c     | 124 ++++++++++++++++++++++++++++++++++++++++----------
 net/tipc/server.h     |  36 +--------------
 net/tipc/subscr.c     |  64 ++------------------------
 net/tipc/subscr.h     |   4 +-
 5 files changed, 110 insertions(+), 128 deletions(-)

(limited to 'net')

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index b234b7ee0b84..e01c9c691ba2 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -813,8 +813,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref,
  */
 void tipc_nametbl_subscribe(struct tipc_subscription *sub)
 {
-	struct tipc_server *srv = sub->server;
-	struct tipc_net *tn = tipc_net(srv->net);
+	struct tipc_net *tn = tipc_net(sub->net);
 	struct tipc_subscr *s = &sub->evt.s;
 	u32 type = tipc_sub_read(s, seq.type);
 	int index = hash(type);
@@ -822,7 +821,7 @@ void tipc_nametbl_subscribe(struct tipc_subscription *sub)
 	struct tipc_name_seq ns;
 
 	spin_lock_bh(&tn->nametbl_lock);
-	seq = nametbl_find_seq(srv->net, type);
+	seq = nametbl_find_seq(sub->net, type);
 	if (!seq)
 		seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]);
 	if (seq) {
@@ -844,14 +843,13 @@ void tipc_nametbl_subscribe(struct tipc_subscription *sub)
  */
 void tipc_nametbl_unsubscribe(struct tipc_subscription *sub)
 {
-	struct tipc_server *srv = sub->server;
 	struct tipc_subscr *s = &sub->evt.s;
-	struct tipc_net *tn = tipc_net(srv->net);
+	struct tipc_net *tn = tipc_net(sub->net);
 	struct name_seq *seq;
 	u32 type = tipc_sub_read(s, seq.type);
 
 	spin_lock_bh(&tn->nametbl_lock);
-	seq = nametbl_find_seq(srv->net, type);
+	seq = nametbl_find_seq(sub->net, type);
 	if (seq != NULL) {
 		spin_lock_bh(&seq->lock);
 		list_del_init(&sub->nameseq_list);
diff --git a/net/tipc/server.c b/net/tipc/server.c
index a5c112ed3bbe..0abbdd698662 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -49,7 +49,37 @@
 #define CF_CONNECTED		1
 #define CF_SERVER		2
 
-#define sock2con(x) ((struct tipc_conn *)(x)->sk_user_data)
+#define TIPC_SERVER_NAME_LEN	32
+
+/**
+ * struct tipc_server - TIPC server structure
+ * @conn_idr: identifier set of connection
+ * @idr_lock: protect the connection identifier set
+ * @idr_in_use: amount of allocated identifier entry
+ * @net: network namspace instance
+ * @rcvbuf_cache: memory cache of server receive buffer
+ * @rcv_wq: receive workqueue
+ * @send_wq: send workqueue
+ * @max_rcvbuf_size: maximum permitted receive message length
+ * @tipc_conn_new: callback will be called when new connection is incoming
+ * @tipc_conn_release: callback will be called before releasing the connection
+ * @tipc_conn_recvmsg: callback will be called when message arrives
+ * @saddr: TIPC server address
+ * @name: server name
+ * @imp: message importance
+ * @type: socket type
+ */
+struct tipc_server {
+	struct idr conn_idr;
+	spinlock_t idr_lock; /* for idr list */
+	int idr_in_use;
+	struct net *net;
+	struct workqueue_struct *rcv_wq;
+	struct workqueue_struct *send_wq;
+	int max_rcvbuf_size;
+	struct sockaddr_tipc *saddr;
+	char name[TIPC_SERVER_NAME_LEN];
+};
 
 /**
  * struct tipc_conn - TIPC connection structure
@@ -93,6 +123,11 @@ static void tipc_recv_work(struct work_struct *work);
 static void tipc_send_work(struct work_struct *work);
 static void tipc_clean_outqueues(struct tipc_conn *con);
 
+static struct tipc_conn *sock2con(struct sock *sk)
+{
+	return sk->sk_user_data;
+}
+
 static bool connected(struct tipc_conn *con)
 {
 	return con && test_bit(CF_CONNECTED, &con->flags);
@@ -198,14 +233,17 @@ static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con)
 static void tipc_con_delete_sub(struct tipc_conn *con, struct tipc_subscr *s)
 {
 	struct list_head *sub_list = &con->sub_list;
+	struct tipc_net *tn = tipc_net(con->server->net);
 	struct tipc_subscription *sub, *tmp;
 
 	spin_lock_bh(&con->sub_lock);
 	list_for_each_entry_safe(sub, tmp, sub_list, sub_list) {
-		if (!s || !memcmp(s, &sub->evt.s, sizeof(*s)))
+		if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) {
 			tipc_sub_unsubscribe(sub);
-		else if (s)
+			atomic_dec(&tn->subscription_count);
+		} else if (s) {
 			break;
+		}
 	}
 	spin_unlock_bh(&con->sub_lock);
 }
@@ -220,8 +258,7 @@ static void tipc_close_conn(struct tipc_conn *con)
 
 	if (disconnect) {
 		sk->sk_user_data = NULL;
-		if (con->conid)
-			tipc_con_delete_sub(con, NULL);
+		tipc_con_delete_sub(con, NULL);
 	}
 	write_unlock_bh(&sk->sk_callback_lock);
 
@@ -272,15 +309,21 @@ static int tipc_con_rcv_sub(struct tipc_server *srv,
 			    struct tipc_conn *con,
 			    struct tipc_subscr *s)
 {
+	struct tipc_net *tn = tipc_net(srv->net);
 	struct tipc_subscription *sub;
 
 	if (tipc_sub_read(s, filter) & TIPC_SUB_CANCEL) {
 		tipc_con_delete_sub(con, s);
 		return 0;
 	}
-	sub = tipc_sub_subscribe(srv, s, con->conid);
+	if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) {
+		pr_warn("Subscription rejected, max (%u)\n", TIPC_MAX_SUBSCR);
+		return -1;
+	}
+	sub = tipc_sub_subscribe(srv->net, s, con->conid);
 	if (!sub)
 		return -1;
+	atomic_inc(&tn->subscription_count);
 	spin_lock_bh(&con->sub_lock);
 	list_add(&sub->sub_list, &con->sub_list);
 	spin_unlock_bh(&con->sub_lock);
@@ -426,13 +469,14 @@ static void tipc_clean_outqueues(struct tipc_conn *con)
 /* tipc_conn_queue_evt - interrupt level call from a subscription instance
  * The queued job is launched in tipc_send_to_sock()
  */
-void tipc_conn_queue_evt(struct tipc_server *s, int conid,
+void tipc_conn_queue_evt(struct net *net, int conid,
 			 u32 event, struct tipc_event *evt)
 {
+	struct tipc_server *srv = tipc_topsrv(net);
 	struct outqueue_entry *e;
 	struct tipc_conn *con;
 
-	con = tipc_conn_lookup(s, conid);
+	con = tipc_conn_lookup(srv, conid);
 	if (!con)
 		return;
 
@@ -448,7 +492,7 @@ void tipc_conn_queue_evt(struct tipc_server *s, int conid,
 	list_add_tail(&e->list, &con->outqueue);
 	spin_unlock_bh(&con->outqueue_lock);
 
-	if (queue_work(s->send_wq, &con->swork))
+	if (queue_work(srv->send_wq, &con->swork))
 		return;
 err:
 	conn_put(con);
@@ -620,41 +664,71 @@ static int tipc_work_start(struct tipc_server *s)
 	return 0;
 }
 
-int tipc_server_start(struct tipc_server *s)
+int tipc_topsrv_start(struct net *net)
 {
+	struct tipc_net *tn = tipc_net(net);
+	const char name[] = "topology_server";
+	struct sockaddr_tipc *saddr;
+	struct tipc_server *srv;
 	int ret;
 
-	spin_lock_init(&s->idr_lock);
-	idr_init(&s->conn_idr);
-	s->idr_in_use = 0;
+	saddr = kzalloc(sizeof(*saddr), GFP_ATOMIC);
+	if (!saddr)
+		return -ENOMEM;
+	saddr->family			= AF_TIPC;
+	saddr->addrtype			= TIPC_ADDR_NAMESEQ;
+	saddr->addr.nameseq.type	= TIPC_TOP_SRV;
+	saddr->addr.nameseq.lower	= TIPC_TOP_SRV;
+	saddr->addr.nameseq.upper	= TIPC_TOP_SRV;
+	saddr->scope			= TIPC_NODE_SCOPE;
+
+	srv = kzalloc(sizeof(*srv), GFP_ATOMIC);
+	if (!srv) {
+		kfree(saddr);
+		return -ENOMEM;
+	}
+	srv->net			= net;
+	srv->saddr			= saddr;
+	srv->max_rcvbuf_size		= sizeof(struct tipc_subscr);
+
+	strncpy(srv->name, name, strlen(name) + 1);
+	tn->topsrv = srv;
+	atomic_set(&tn->subscription_count, 0);
+
+	spin_lock_init(&srv->idr_lock);
+	idr_init(&srv->conn_idr);
+	srv->idr_in_use = 0;
 
-	ret = tipc_work_start(s);
+	ret = tipc_work_start(srv);
 	if (ret < 0)
 		return ret;
 
-	ret = tipc_open_listening_sock(s);
+	ret = tipc_open_listening_sock(srv);
 	if (ret < 0)
-		tipc_work_stop(s);
+		tipc_work_stop(srv);
 
 	return ret;
 }
 
-void tipc_server_stop(struct tipc_server *s)
+void tipc_topsrv_stop(struct net *net)
 {
+	struct tipc_server *srv = tipc_topsrv(net);
 	struct tipc_conn *con;
 	int id;
 
-	spin_lock_bh(&s->idr_lock);
-	for (id = 0; s->idr_in_use; id++) {
-		con = idr_find(&s->conn_idr, id);
+	spin_lock_bh(&srv->idr_lock);
+	for (id = 0; srv->idr_in_use; id++) {
+		con = idr_find(&srv->conn_idr, id);
 		if (con) {
-			spin_unlock_bh(&s->idr_lock);
+			spin_unlock_bh(&srv->idr_lock);
 			tipc_close_conn(con);
-			spin_lock_bh(&s->idr_lock);
+			spin_lock_bh(&srv->idr_lock);
 		}
 	}
-	spin_unlock_bh(&s->idr_lock);
+	spin_unlock_bh(&srv->idr_lock);
 
-	tipc_work_stop(s);
-	idr_destroy(&s->conn_idr);
+	tipc_work_stop(srv);
+	idr_destroy(&srv->conn_idr);
+	kfree(srv->saddr);
+	kfree(srv);
 }
diff --git a/net/tipc/server.h b/net/tipc/server.h
index 995b79591ffe..ce93b6d68e41 100644
--- a/net/tipc/server.h
+++ b/net/tipc/server.h
@@ -47,45 +47,11 @@
 #define TIPC_SUB_NODE_SCOPE     0x40
 #define TIPC_SUB_NO_STATUS      0x80
 
-/**
- * struct tipc_server - TIPC server structure
- * @conn_idr: identifier set of connection
- * @idr_lock: protect the connection identifier set
- * @idr_in_use: amount of allocated identifier entry
- * @net: network namspace instance
- * @rcvbuf_cache: memory cache of server receive buffer
- * @rcv_wq: receive workqueue
- * @send_wq: send workqueue
- * @max_rcvbuf_size: maximum permitted receive message length
- * @tipc_conn_new: callback will be called when new connection is incoming
- * @tipc_conn_release: callback will be called before releasing the connection
- * @tipc_conn_recvmsg: callback will be called when message arrives
- * @saddr: TIPC server address
- * @name: server name
- * @imp: message importance
- * @type: socket type
- */
-struct tipc_server {
-	struct idr conn_idr;
-	spinlock_t idr_lock;
-	int idr_in_use;
-	struct net *net;
-	struct workqueue_struct *rcv_wq;
-	struct workqueue_struct *send_wq;
-	int max_rcvbuf_size;
-	struct sockaddr_tipc *saddr;
-	char name[TIPC_SERVER_NAME_LEN];
-};
-
-void tipc_conn_queue_evt(struct tipc_server *s, int conid,
+void tipc_conn_queue_evt(struct net *net, int conid,
 			 u32 event, struct tipc_event *evt);
 
 bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
 			     u32 upper, u32 filter, int *conid);
 void tipc_topsrv_kern_unsubscr(struct net *net, int conid);
 
-int tipc_server_start(struct tipc_server *s);
-
-void tipc_server_stop(struct tipc_server *s);
-
 #endif
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 3be1e4b6cbfa..c8146568d04e 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -51,7 +51,7 @@ static void tipc_sub_send_event(struct tipc_subscription *sub,
 	tipc_evt_write(evt, found_upper, found_upper);
 	tipc_evt_write(evt, port.ref, port);
 	tipc_evt_write(evt, port.node, node);
-	tipc_conn_queue_evt(sub->server, sub->conid, event, evt);
+	tipc_conn_queue_evt(sub->net, sub->conid, event, evt);
 }
 
 /**
@@ -114,14 +114,7 @@ static void tipc_sub_timeout(struct timer_list *t)
 
 static void tipc_sub_kref_release(struct kref *kref)
 {
-	struct tipc_subscription *sub;
-	struct tipc_net *tn;
-
-	sub = container_of(kref, struct tipc_subscription, kref);
-	tn = tipc_net(sub->server->net);
-
-	atomic_dec(&tn->subscription_count);
-	kfree(sub);
+	kfree(container_of(kref, struct tipc_subscription, kref));
 }
 
 void tipc_sub_put(struct tipc_subscription *subscription)
@@ -134,19 +127,14 @@ void tipc_sub_get(struct tipc_subscription *subscription)
 	kref_get(&subscription->kref);
 }
 
-struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv,
+struct tipc_subscription *tipc_sub_subscribe(struct net *net,
 					     struct tipc_subscr *s,
 					     int conid)
 {
-	struct tipc_net *tn = tipc_net(srv->net);
 	u32 filter = tipc_sub_read(s, filter);
 	struct tipc_subscription *sub;
 	u32 timeout;
 
-	if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) {
-		pr_warn("Subscription rejected, max (%u)\n", TIPC_MAX_SUBSCR);
-		return NULL;
-	}
 	if ((filter & TIPC_SUB_PORTS && filter & TIPC_SUB_SERVICE) ||
 	    (tipc_sub_read(s, seq.lower) > tipc_sub_read(s, seq.upper))) {
 		pr_warn("Subscription rejected, illegal request\n");
@@ -157,12 +145,11 @@ struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv,
 		pr_warn("Subscription rejected, no memory\n");
 		return NULL;
 	}
-	sub->server = srv;
+	sub->net = net;
 	sub->conid = conid;
 	sub->inactive = false;
 	memcpy(&sub->evt.s, s, sizeof(*s));
 	spin_lock_init(&sub->lock);
-	atomic_inc(&tn->subscription_count);
 	kref_init(&sub->kref);
 	tipc_nametbl_subscribe(sub);
 	timer_setup(&sub->timer, tipc_sub_timeout, 0);
@@ -180,46 +167,3 @@ void tipc_sub_unsubscribe(struct tipc_subscription *sub)
 	list_del(&sub->sub_list);
 	tipc_sub_put(sub);
 }
-
-int tipc_topsrv_start(struct net *net)
-{
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	const char name[] = "topology_server";
-	struct sockaddr_tipc *saddr;
-	struct tipc_server *topsrv;
-
-	saddr = kzalloc(sizeof(*saddr), GFP_ATOMIC);
-	if (!saddr)
-		return -ENOMEM;
-	saddr->family			= AF_TIPC;
-	saddr->addrtype			= TIPC_ADDR_NAMESEQ;
-	saddr->addr.nameseq.type	= TIPC_TOP_SRV;
-	saddr->addr.nameseq.lower	= TIPC_TOP_SRV;
-	saddr->addr.nameseq.upper	= TIPC_TOP_SRV;
-	saddr->scope			= TIPC_NODE_SCOPE;
-
-	topsrv = kzalloc(sizeof(*topsrv), GFP_ATOMIC);
-	if (!topsrv) {
-		kfree(saddr);
-		return -ENOMEM;
-	}
-	topsrv->net			= net;
-	topsrv->saddr			= saddr;
-	topsrv->max_rcvbuf_size		= sizeof(struct tipc_subscr);
-
-	strncpy(topsrv->name, name, strlen(name) + 1);
-	tn->topsrv = topsrv;
-	atomic_set(&tn->subscription_count, 0);
-
-	return tipc_server_start(topsrv);
-}
-
-void tipc_topsrv_stop(struct net *net)
-{
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	struct tipc_server *topsrv = tn->topsrv;
-
-	tipc_server_stop(topsrv);
-	kfree(topsrv->saddr);
-	kfree(topsrv);
-}
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index 720932896ba6..82ba61afe638 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -56,7 +56,7 @@ struct tipc_conn;
  */
 struct tipc_subscription {
 	struct kref kref;
-	struct tipc_server *server;
+	struct net *net;
 	struct timer_list timer;
 	struct list_head nameseq_list;
 	struct list_head sub_list;
@@ -66,7 +66,7 @@ struct tipc_subscription {
 	spinlock_t lock; /* serialize up/down and timer events */
 };
 
-struct tipc_subscription *tipc_sub_subscribe(struct tipc_server *srv,
+struct tipc_subscription *tipc_sub_subscribe(struct net *net,
 					     struct tipc_subscr *s,
 					     int conid);
 void tipc_sub_unsubscribe(struct tipc_subscription *sub);
-- 
cgit v1.2.3


From 0ef897be12b8b4cf297b6016e79ec97ec90f2cf6 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Feb 2018 10:40:50 +0100
Subject: tipc: separate topology server listener socket from subcsriber
 sockets

We move the listener socket to struct tipc_server and give it its own
work item. This makes it easier to follow the code, and entails some
simplifications in the reception code in subscriber sockets.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/server.c | 328 ++++++++++++++++++++++++------------------------------
 1 file changed, 147 insertions(+), 181 deletions(-)

(limited to 'net')

diff --git a/net/tipc/server.c b/net/tipc/server.c
index 0abbdd698662..0e351e81690e 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -64,7 +64,6 @@
  * @tipc_conn_new: callback will be called when new connection is incoming
  * @tipc_conn_release: callback will be called before releasing the connection
  * @tipc_conn_recvmsg: callback will be called when message arrives
- * @saddr: TIPC server address
  * @name: server name
  * @imp: message importance
  * @type: socket type
@@ -74,10 +73,11 @@ struct tipc_server {
 	spinlock_t idr_lock; /* for idr list */
 	int idr_in_use;
 	struct net *net;
+	struct work_struct awork;
 	struct workqueue_struct *rcv_wq;
 	struct workqueue_struct *send_wq;
 	int max_rcvbuf_size;
-	struct sockaddr_tipc *saddr;
+	struct socket *listener;
 	char name[TIPC_SERVER_NAME_LEN];
 };
 
@@ -106,7 +106,6 @@ struct tipc_conn {
 	struct list_head sub_list;
 	spinlock_t sub_lock; /* for subscription list */
 	struct work_struct rwork;
-	int (*rx_action) (struct tipc_conn *con);
 	struct list_head outqueue;
 	spinlock_t outqueue_lock;
 	struct work_struct swork;
@@ -121,12 +120,6 @@ struct outqueue_entry {
 
 static void tipc_recv_work(struct work_struct *work);
 static void tipc_send_work(struct work_struct *work);
-static void tipc_clean_outqueues(struct tipc_conn *con);
-
-static struct tipc_conn *sock2con(struct sock *sk)
-{
-	return sk->sk_user_data;
-}
 
 static bool connected(struct tipc_conn *con)
 {
@@ -137,21 +130,21 @@ static void tipc_conn_kref_release(struct kref *kref)
 {
 	struct tipc_conn *con = container_of(kref, struct tipc_conn, kref);
 	struct tipc_server *s = con->server;
-	struct socket *sock = con->sock;
+	struct outqueue_entry *e, *safe;
 
-	if (sock) {
-		if (test_bit(CF_SERVER, &con->flags)) {
-			__module_get(sock->ops->owner);
-			__module_get(sock->sk->sk_prot_creator->owner);
-		}
-		sock_release(sock);
-		con->sock = NULL;
-	}
 	spin_lock_bh(&s->idr_lock);
 	idr_remove(&s->conn_idr, con->conid);
 	s->idr_in_use--;
 	spin_unlock_bh(&s->idr_lock);
-	tipc_clean_outqueues(con);
+	if (con->sock)
+		sock_release(con->sock);
+
+	spin_lock_bh(&con->outqueue_lock);
+	list_for_each_entry_safe(e, safe, &con->outqueue, list) {
+		list_del(&e->list);
+		kfree(e);
+	}
+	spin_unlock_bh(&con->outqueue_lock);
 	kfree(con);
 }
 
@@ -178,14 +171,14 @@ static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid)
 }
 
 /* sock_data_ready - interrupt callback indicating the socket has data to read
- * The queued job is launched in tipc_recv_from_sock()
+ * The queued work is launched into tipc_recv_work()->tipc_recv_from_sock()
  */
 static void sock_data_ready(struct sock *sk)
 {
 	struct tipc_conn *con;
 
 	read_lock_bh(&sk->sk_callback_lock);
-	con = sock2con(sk);
+	con = sk->sk_user_data;
 	if (connected(con)) {
 		conn_get(con);
 		if (!queue_work(con->server->rcv_wq, &con->rwork))
@@ -195,15 +188,15 @@ static void sock_data_ready(struct sock *sk)
 }
 
 /* sock_write_space - interrupt callback after a sendmsg EAGAIN
- * Indicates that there now is more is space in the send buffer
- * The queued job is launched in tipc_send_to_sock()
+ * Indicates that there now is more space in the send buffer
+ * The queued work is launched into tipc_send_work()->tipc_send_to_sock()
  */
 static void sock_write_space(struct sock *sk)
 {
 	struct tipc_conn *con;
 
 	read_lock_bh(&sk->sk_callback_lock);
-	con = sock2con(sk);
+	con = sk->sk_user_data;
 	if (connected(con)) {
 		conn_get(con);
 		if (!queue_work(con->server->send_wq, &con->swork))
@@ -212,23 +205,8 @@ static void sock_write_space(struct sock *sk)
 	read_unlock_bh(&sk->sk_callback_lock);
 }
 
-static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con)
-{
-	struct sock *sk = sock->sk;
-
-	write_lock_bh(&sk->sk_callback_lock);
-
-	sk->sk_data_ready = sock_data_ready;
-	sk->sk_write_space = sock_write_space;
-	sk->sk_user_data = con;
-
-	con->sock = sock;
-
-	write_unlock_bh(&sk->sk_callback_lock);
-}
-
 /* tipc_con_delete_sub - delete a specific or all subscriptions
- *  for a given subscriber
+ * for a given subscriber
  */
 static void tipc_con_delete_sub(struct tipc_conn *con, struct tipc_subscr *s)
 {
@@ -268,6 +246,7 @@ static void tipc_close_conn(struct tipc_conn *con)
 
 	/* Don't flush pending works, -just let them expire */
 	kernel_sock_shutdown(con->sock, SHUT_RDWR);
+
 	conn_put(con);
 }
 
@@ -357,117 +336,8 @@ static int tipc_receive_from_sock(struct tipc_conn *con)
 	return ret;
 }
 
-static int tipc_accept_from_sock(struct tipc_conn *con)
-{
-	struct socket *sock = con->sock;
-	struct socket *newsock;
-	struct tipc_conn *newcon;
-	int ret;
-
-	ret = kernel_accept(sock, &newsock, O_NONBLOCK);
-	if (ret < 0)
-		return ret;
-
-	newcon = tipc_alloc_conn(con->server);
-	if (IS_ERR(newcon)) {
-		ret = PTR_ERR(newcon);
-		sock_release(newsock);
-		return ret;
-	}
-
-	newcon->rx_action = tipc_receive_from_sock;
-	tipc_register_callbacks(newsock, newcon);
-
-	/* Wake up receive process in case of 'SYN+' message */
-	newsock->sk->sk_data_ready(newsock->sk);
-	return ret;
-}
-
-static struct socket *tipc_create_listen_sock(struct tipc_conn *con)
-{
-	struct tipc_server *s = con->server;
-	struct socket *sock = NULL;
-	int imp = TIPC_CRITICAL_IMPORTANCE;
-	int ret;
-
-	ret = sock_create_kern(s->net, AF_TIPC, SOCK_SEQPACKET, 0, &sock);
-	if (ret < 0)
-		return NULL;
-	ret = kernel_setsockopt(sock, SOL_TIPC, TIPC_IMPORTANCE,
-				(char *)&imp, sizeof(imp));
-	if (ret < 0)
-		goto create_err;
-	ret = kernel_bind(sock, (struct sockaddr *)s->saddr, sizeof(*s->saddr));
-	if (ret < 0)
-		goto create_err;
-
-	con->rx_action = tipc_accept_from_sock;
-	ret = kernel_listen(sock, 0);
-	if (ret < 0)
-		goto create_err;
-
-	/* As server's listening socket owner and creator is the same module,
-	 * we have to decrease TIPC module reference count to guarantee that
-	 * it remains zero after the server socket is created, otherwise,
-	 * executing "rmmod" command is unable to make TIPC module deleted
-	 * after TIPC module is inserted successfully.
-	 *
-	 * However, the reference count is ever increased twice in
-	 * sock_create_kern(): one is to increase the reference count of owner
-	 * of TIPC socket's proto_ops struct; another is to increment the
-	 * reference count of owner of TIPC proto struct. Therefore, we must
-	 * decrement the module reference count twice to ensure that it keeps
-	 * zero after server's listening socket is created. Of course, we
-	 * must bump the module reference count twice as well before the socket
-	 * is closed.
-	 */
-	module_put(sock->ops->owner);
-	module_put(sock->sk->sk_prot_creator->owner);
-	set_bit(CF_SERVER, &con->flags);
-
-	return sock;
-
-create_err:
-	kernel_sock_shutdown(sock, SHUT_RDWR);
-	sock_release(sock);
-	return NULL;
-}
-
-static int tipc_open_listening_sock(struct tipc_server *s)
-{
-	struct socket *sock;
-	struct tipc_conn *con;
-
-	con = tipc_alloc_conn(s);
-	if (IS_ERR(con))
-		return PTR_ERR(con);
-
-	sock = tipc_create_listen_sock(con);
-	if (!sock) {
-		idr_remove(&s->conn_idr, con->conid);
-		s->idr_in_use--;
-		kfree(con);
-		return -EINVAL;
-	}
-
-	tipc_register_callbacks(sock, con);
-	return 0;
-}
-
-static void tipc_clean_outqueues(struct tipc_conn *con)
-{
-	struct outqueue_entry *e, *safe;
-
-	spin_lock_bh(&con->outqueue_lock);
-	list_for_each_entry_safe(e, safe, &con->outqueue, list) {
-		list_del(&e->list);
-		kfree(e);
-	}
-	spin_unlock_bh(&con->outqueue_lock);
-}
-
-/* tipc_conn_queue_evt - interrupt level call from a subscription instance
- * The queued job is launched in tipc_send_to_sock()
+/* tipc_conn_queue_evt() - interrupt level call from a subscription instance
+ * The queued work is launched into tipc_send_work()->tipc_send_to_sock()
  */
 void tipc_conn_queue_evt(struct net *net, int conid,
 			 u32 event, struct tipc_event *evt)
@@ -588,9 +458,9 @@ static void tipc_send_to_sock(struct tipc_conn *con)
 					     1, sizeof(*evt));
 			if (ret == -EWOULDBLOCK || ret == 0) {
 				cond_resched();
-				goto out;
+				return;
 			} else if (ret < 0) {
-				goto err;
+				return tipc_close_conn(con);
 			}
 		} else {
 			tipc_send_kern_top_evt(srv->net, evt);
@@ -606,10 +476,6 @@ static void tipc_send_to_sock(struct tipc_conn *con)
 		kfree(e);
 	}
 	spin_unlock_bh(&con->outqueue_lock);
-out:
-	return;
-err:
-	tipc_close_conn(con);
 }
 
 static void tipc_recv_work(struct work_struct *work)
@@ -618,7 +484,7 @@ static void tipc_recv_work(struct work_struct *work)
 	int count = 0;
 
 	while (connected(con)) {
-		if (con->rx_action(con))
+		if (tipc_receive_from_sock(con))
 			break;
 
 		/* Don't flood Rx machine */
@@ -640,10 +506,113 @@ static void tipc_send_work(struct work_struct *work)
 	conn_put(con);
 }
 
-static void tipc_work_stop(struct tipc_server *s)
+static void tipc_accept_from_sock(struct work_struct *work)
 {
-	destroy_workqueue(s->rcv_wq);
-	destroy_workqueue(s->send_wq);
+	struct tipc_server *srv = container_of(work, struct tipc_server, awork);
+	struct socket *lsock = srv->listener;
+	struct socket *newsock;
+	struct tipc_conn *con;
+	struct sock *newsk;
+	int ret;
+
+	while (1) {
+		ret = kernel_accept(lsock, &newsock, O_NONBLOCK);
+		if (ret < 0)
+			return;
+		con = tipc_alloc_conn(srv);
+		if (IS_ERR(con)) {
+			ret = PTR_ERR(con);
+			sock_release(newsock);
+			return;
+		}
+		/* Register callbacks */
+		newsk = newsock->sk;
+		write_lock_bh(&newsk->sk_callback_lock);
+		newsk->sk_data_ready = sock_data_ready;
+		newsk->sk_write_space = sock_write_space;
+		newsk->sk_user_data = con;
+		con->sock = newsock;
+		write_unlock_bh(&newsk->sk_callback_lock);
+
+		/* Wake up receive process in case of 'SYN+' message */
+		newsk->sk_data_ready(newsk);
+	}
+}
+
+/* listener_sock_data_ready - interrupt callback indicating new connection
+ * The queued job is launched into tipc_accept_from_sock()
+ */
+static void listener_sock_data_ready(struct sock *sk)
+{
+	struct tipc_server *srv;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	srv = sk->sk_user_data;
+	if (srv->listener)
+		queue_work(srv->rcv_wq, &srv->awork);
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int tipc_create_listener_sock(struct tipc_server *srv)
+{
+	int imp = TIPC_CRITICAL_IMPORTANCE;
+	struct socket *lsock = NULL;
+	struct sockaddr_tipc saddr;
+	struct sock *sk;
+	int rc;
+
+	rc = sock_create_kern(srv->net, AF_TIPC, SOCK_SEQPACKET, 0, &lsock);
+	if (rc < 0)
+		return rc;
+
+	srv->listener = lsock;
+	sk = lsock->sk;
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_data_ready = listener_sock_data_ready;
+	sk->sk_user_data = srv;
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	rc = kernel_setsockopt(lsock, SOL_TIPC, TIPC_IMPORTANCE,
+			       (char *)&imp, sizeof(imp));
+	if (rc < 0)
+		goto err;
+
+	saddr.family	                = AF_TIPC;
+	saddr.addrtype		        = TIPC_ADDR_NAMESEQ;
+	saddr.addr.nameseq.type	        = TIPC_TOP_SRV;
+	saddr.addr.nameseq.lower	= TIPC_TOP_SRV;
+	saddr.addr.nameseq.upper	= TIPC_TOP_SRV;
+	saddr.scope			= TIPC_NODE_SCOPE;
+
+	rc = kernel_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr));
+	if (rc < 0)
+		goto err;
+	rc = kernel_listen(lsock, 0);
+	if (rc < 0)
+		goto err;
+
+	/* As server's listening socket owner and creator is the same module,
+	 * we have to decrease TIPC module reference count to guarantee that
+	 * it remains zero after the server socket is created, otherwise,
+	 * executing "rmmod" command is unable to make TIPC module deleted
+	 * after TIPC module is inserted successfully.
+	 *
+	 * However, the reference count is ever increased twice in
+	 * sock_create_kern(): one is to increase the reference count of owner
+	 * of TIPC socket's proto_ops struct; another is to increment the
+	 * reference count of owner of TIPC proto struct. Therefore, we must
+	 * decrement the module reference count twice to ensure that it keeps
+	 * zero after server's listening socket is created. Of course, we
+	 * must bump the module reference count twice as well before the socket
+	 * is closed.
+	 */
+	module_put(lsock->ops->owner);
+	module_put(sk->sk_prot_creator->owner);
+
+	return 0;
+err:
+	sock_release(lsock);
+	return -EINVAL;
 }
 
 static int tipc_work_start(struct tipc_server *s)
@@ -664,32 +633,26 @@ static int tipc_work_start(struct tipc_server *s)
 	return 0;
 }
 
+static void tipc_work_stop(struct tipc_server *s)
+{
+	destroy_workqueue(s->rcv_wq);
+	destroy_workqueue(s->send_wq);
+}
+
 int tipc_topsrv_start(struct net *net)
 {
 	struct tipc_net *tn = tipc_net(net);
 	const char name[] = "topology_server";
-	struct sockaddr_tipc *saddr;
 	struct tipc_server *srv;
 	int ret;
 
-	saddr = kzalloc(sizeof(*saddr), GFP_ATOMIC);
-	if (!saddr)
-		return -ENOMEM;
-	saddr->family			= AF_TIPC;
-	saddr->addrtype			= TIPC_ADDR_NAMESEQ;
-	saddr->addr.nameseq.type	= TIPC_TOP_SRV;
-	saddr->addr.nameseq.lower	= TIPC_TOP_SRV;
-	saddr->addr.nameseq.upper	= TIPC_TOP_SRV;
-	saddr->scope			= TIPC_NODE_SCOPE;
-
 	srv = kzalloc(sizeof(*srv), GFP_ATOMIC);
-	if (!srv) {
-		kfree(saddr);
+	if (!srv)
 		return -ENOMEM;
-	}
-	srv->net			= net;
-	srv->saddr			= saddr;
-	srv->max_rcvbuf_size		= sizeof(struct tipc_subscr);
+
+	srv->net = net;
+	srv->max_rcvbuf_size = sizeof(struct tipc_subscr);
+	INIT_WORK(&srv->awork, tipc_accept_from_sock);
 
 	strncpy(srv->name, name, strlen(name) + 1);
 	tn->topsrv = srv;
@@ -703,7 +666,7 @@ int tipc_topsrv_start(struct net *net)
 	if (ret < 0)
 		return ret;
 
-	ret = tipc_open_listening_sock(srv);
+	ret = tipc_create_listener_sock(srv);
 	if (ret < 0)
 		tipc_work_stop(srv);
 
@@ -713,6 +676,7 @@ int tipc_topsrv_start(struct net *net)
 void tipc_topsrv_stop(struct net *net)
 {
 	struct tipc_server *srv = tipc_topsrv(net);
+	struct socket *lsock = srv->listener;
 	struct tipc_conn *con;
 	int id;
 
@@ -725,10 +689,12 @@ void tipc_topsrv_stop(struct net *net)
 			spin_lock_bh(&srv->idr_lock);
 		}
 	}
+	__module_get(lsock->ops->owner);
+	__module_get(lsock->sk->sk_prot_creator->owner);
+	sock_release(lsock);
+	srv->listener = NULL;
 	spin_unlock_bh(&srv->idr_lock);
-
 	tipc_work_stop(srv);
 	idr_destroy(&srv->conn_idr);
-	kfree(srv->saddr);
 	kfree(srv);
 }
-- 
cgit v1.2.3


From 026321c6d056a54b4145522492245d2b5913ee1d Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Feb 2018 10:40:51 +0100
Subject: tipc: rename tipc_server to tipc_topsrv

We rename struct tipc_server to struct tipc_topsrv. This reflect its now
specialized role as topology server. Accoringly, we change or add function
prefixes to make it clearer which functionality those belong to.

There are no functional changes in this commit.

Acked-by: Ying.Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/Makefile |   2 +-
 net/tipc/core.h   |   6 +-
 net/tipc/group.c  |   2 +-
 net/tipc/server.c | 700 -----------------------------------------------------
 net/tipc/server.h |  57 -----
 net/tipc/subscr.c |   2 +-
 net/tipc/subscr.h |   2 +-
 net/tipc/topsrv.c | 702 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/tipc/topsrv.h |  54 +++++
 9 files changed, 763 insertions(+), 764 deletions(-)
 delete mode 100644 net/tipc/server.c
 delete mode 100644 net/tipc/server.h
 create mode 100644 net/tipc/topsrv.c
 create mode 100644 net/tipc/topsrv.h

(limited to 'net')

diff --git a/net/tipc/Makefile b/net/tipc/Makefile
index 37bb0bfbd936..1edb7192aa2f 100644
--- a/net/tipc/Makefile
+++ b/net/tipc/Makefile
@@ -9,7 +9,7 @@ tipc-y	+= addr.o bcast.o bearer.o \
 	   core.o link.o discover.o msg.o  \
 	   name_distr.o  subscr.o monitor.o name_table.o net.o  \
 	   netlink.o netlink_compat.o node.o socket.o eth_media.o \
-	   server.o socket.o group.o
+	   topsrv.o socket.o group.o
 
 tipc-$(CONFIG_TIPC_MEDIA_UDP)	+= udp_media.o
 tipc-$(CONFIG_TIPC_MEDIA_IB)	+= ib_media.o
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 20b21af2ff14..ff8b071654f5 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -64,7 +64,7 @@ struct tipc_bearer;
 struct tipc_bc_base;
 struct tipc_link;
 struct tipc_name_table;
-struct tipc_server;
+struct tipc_topsrv;
 struct tipc_monitor;
 
 #define TIPC_MOD_VER "2.0.0"
@@ -112,7 +112,7 @@ struct tipc_net {
 	struct list_head dist_queue;
 
 	/* Topology subscription server */
-	struct tipc_server *topsrv;
+	struct tipc_topsrv *topsrv;
 	atomic_t subscription_count;
 };
 
@@ -131,7 +131,7 @@ static inline struct list_head *tipc_nodes(struct net *net)
 	return &tipc_net(net)->node_list;
 }
 
-static inline struct tipc_server *tipc_topsrv(struct net *net)
+static inline struct tipc_topsrv *tipc_topsrv(struct net *net)
 {
 	return tipc_net(net)->topsrv;
 }
diff --git a/net/tipc/group.c b/net/tipc/group.c
index 122162a31816..03086ccb7746 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -37,7 +37,7 @@
 #include "addr.h"
 #include "group.h"
 #include "bcast.h"
-#include "server.h"
+#include "topsrv.h"
 #include "msg.h"
 #include "socket.h"
 #include "node.h"
diff --git a/net/tipc/server.c b/net/tipc/server.c
deleted file mode 100644
index 0e351e81690e..000000000000
--- a/net/tipc/server.c
+++ /dev/null
@@ -1,700 +0,0 @@
-/*
- * net/tipc/server.c: TIPC server infrastructure
- *
- * Copyright (c) 2012-2013, Wind River Systems
- * Copyright (c) 2017, Ericsson AB
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the names of the copyright holders nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL") version 2 as published by the Free
- * Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "subscr.h"
-#include "server.h"
-#include "core.h"
-#include "socket.h"
-#include "addr.h"
-#include "msg.h"
-#include <net/sock.h>
-#include <linux/module.h>
-
-/* Number of messages to send before rescheduling */
-#define MAX_SEND_MSG_COUNT	25
-#define MAX_RECV_MSG_COUNT	25
-#define CF_CONNECTED		1
-#define CF_SERVER		2
-
-#define TIPC_SERVER_NAME_LEN	32
-
-/**
- * struct tipc_server - TIPC server structure
- * @conn_idr: identifier set of connection
- * @idr_lock: protect the connection identifier set
- * @idr_in_use: amount of allocated identifier entry
- * @net: network namspace instance
- * @rcvbuf_cache: memory cache of server receive buffer
- * @rcv_wq: receive workqueue
- * @send_wq: send workqueue
- * @max_rcvbuf_size: maximum permitted receive message length
- * @tipc_conn_new: callback will be called when new connection is incoming
- * @tipc_conn_release: callback will be called before releasing the connection
- * @tipc_conn_recvmsg: callback will be called when message arrives
- * @name: server name
- * @imp: message importance
- * @type: socket type
- */
-struct tipc_server {
-	struct idr conn_idr;
-	spinlock_t idr_lock; /* for idr list */
-	int idr_in_use;
-	struct net *net;
-	struct work_struct awork;
-	struct workqueue_struct *rcv_wq;
-	struct workqueue_struct *send_wq;
-	int max_rcvbuf_size;
-	struct socket *listener;
-	char name[TIPC_SERVER_NAME_LEN];
-};
-
-/**
- * struct tipc_conn - TIPC connection structure
- * @kref: reference counter to connection object
- * @conid: connection identifier
- * @sock: socket handler associated with connection
- * @flags: indicates connection state
- * @server: pointer to connected server
- * @sub_list: lsit to all pertaing subscriptions
- * @sub_lock: lock protecting the subscription list
- * @outqueue_lock: control access to the outqueue
- * @rwork: receive work item
- * @rx_action: what to do when connection socket is active
- * @outqueue: pointer to first outbound message in queue
- * @outqueue_lock: control access to the outqueue
- * @swork: send work item
- */
-struct tipc_conn {
-	struct kref kref;
-	int conid;
-	struct socket *sock;
-	unsigned long flags;
-	struct tipc_server *server;
-	struct list_head sub_list;
-	spinlock_t sub_lock; /* for subscription list */
-	struct work_struct rwork;
-	struct list_head outqueue;
-	spinlock_t outqueue_lock;
-	struct work_struct swork;
-};
-
-/* An entry waiting to be sent */
-struct outqueue_entry {
-	bool inactive;
-	struct tipc_event evt;
-	struct list_head list;
-};
-
-static void tipc_recv_work(struct work_struct *work);
-static void tipc_send_work(struct work_struct *work);
-
-static bool connected(struct tipc_conn *con)
-{
-	return con && test_bit(CF_CONNECTED, &con->flags);
-}
-
-static void tipc_conn_kref_release(struct kref *kref)
-{
-	struct tipc_conn *con = container_of(kref, struct tipc_conn, kref);
-	struct tipc_server *s = con->server;
-	struct outqueue_entry *e, *safe;
-
-	spin_lock_bh(&s->idr_lock);
-	idr_remove(&s->conn_idr, con->conid);
-	s->idr_in_use--;
-	spin_unlock_bh(&s->idr_lock);
-	if (con->sock)
-		sock_release(con->sock);
-
-	spin_lock_bh(&con->outqueue_lock);
-	list_for_each_entry_safe(e, safe, &con->outqueue, list) {
-		list_del(&e->list);
-		kfree(e);
-	}
-	spin_unlock_bh(&con->outqueue_lock);
-	kfree(con);
-}
-
-static void conn_put(struct tipc_conn *con)
-{
-	kref_put(&con->kref, tipc_conn_kref_release);
-}
-
-static void conn_get(struct tipc_conn *con)
-{
-	kref_get(&con->kref);
-}
-
-static struct tipc_conn *tipc_conn_lookup(struct tipc_server *s, int conid)
-{
-	struct tipc_conn *con;
-
-	spin_lock_bh(&s->idr_lock);
-	con = idr_find(&s->conn_idr, conid);
-	if (!connected(con) || !kref_get_unless_zero(&con->kref))
-		con = NULL;
-	spin_unlock_bh(&s->idr_lock);
-	return con;
-}
-
-/* sock_data_ready - interrupt callback indicating the socket has data to read
- * The queued work is launched into tipc_recv_work()->tipc_recv_from_sock()
- */
-static void sock_data_ready(struct sock *sk)
-{
-	struct tipc_conn *con;
-
-	read_lock_bh(&sk->sk_callback_lock);
-	con = sk->sk_user_data;
-	if (connected(con)) {
-		conn_get(con);
-		if (!queue_work(con->server->rcv_wq, &con->rwork))
-			conn_put(con);
-	}
-	read_unlock_bh(&sk->sk_callback_lock);
-}
-
-/* sock_write_space - interrupt callback after a sendmsg EAGAIN
- * Indicates that there now is more space in the send buffer
- * The queued work is launched into tipc_send_work()->tipc_send_to_sock()
- */
-static void sock_write_space(struct sock *sk)
-{
-	struct tipc_conn *con;
-
-	read_lock_bh(&sk->sk_callback_lock);
-	con = sk->sk_user_data;
-	if (connected(con)) {
-		conn_get(con);
-		if (!queue_work(con->server->send_wq, &con->swork))
-			conn_put(con);
-	}
-	read_unlock_bh(&sk->sk_callback_lock);
-}
-
-/* tipc_con_delete_sub - delete a specific or all subscriptions
- * for a given subscriber
- */
-static void tipc_con_delete_sub(struct tipc_conn *con, struct tipc_subscr *s)
-{
-	struct list_head *sub_list = &con->sub_list;
-	struct tipc_net *tn = tipc_net(con->server->net);
-	struct tipc_subscription *sub, *tmp;
-
-	spin_lock_bh(&con->sub_lock);
-	list_for_each_entry_safe(sub, tmp, sub_list, sub_list) {
-		if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) {
-			tipc_sub_unsubscribe(sub);
-			atomic_dec(&tn->subscription_count);
-		} else if (s) {
-			break;
-		}
-	}
-	spin_unlock_bh(&con->sub_lock);
-}
-
-static void tipc_close_conn(struct tipc_conn *con)
-{
-	struct sock *sk = con->sock->sk;
-	bool disconnect = false;
-
-	write_lock_bh(&sk->sk_callback_lock);
-	disconnect = test_and_clear_bit(CF_CONNECTED, &con->flags);
-
-	if (disconnect) {
-		sk->sk_user_data = NULL;
-		tipc_con_delete_sub(con, NULL);
-	}
-	write_unlock_bh(&sk->sk_callback_lock);
-
-	/* Handle concurrent calls from sending and receiving threads */
-	if (!disconnect)
-		return;
-
-	/* Don't flush pending works, -just let them expire */
-	kernel_sock_shutdown(con->sock, SHUT_RDWR);
-
-	conn_put(con);
-}
-
-static struct tipc_conn *tipc_alloc_conn(struct tipc_server *s)
-{
-	struct tipc_conn *con;
-	int ret;
-
-	con = kzalloc(sizeof(struct tipc_conn), GFP_ATOMIC);
-	if (!con)
-		return ERR_PTR(-ENOMEM);
-
-	kref_init(&con->kref);
-	INIT_LIST_HEAD(&con->outqueue);
-	INIT_LIST_HEAD(&con->sub_list);
-	spin_lock_init(&con->outqueue_lock);
-	spin_lock_init(&con->sub_lock);
-	INIT_WORK(&con->swork, tipc_send_work);
-	INIT_WORK(&con->rwork, tipc_recv_work);
-
-	spin_lock_bh(&s->idr_lock);
-	ret = idr_alloc(&s->conn_idr, con, 0, 0, GFP_ATOMIC);
-	if (ret < 0) {
-		kfree(con);
-		spin_unlock_bh(&s->idr_lock);
-		return ERR_PTR(-ENOMEM);
-	}
-	con->conid = ret;
-	s->idr_in_use++;
-	spin_unlock_bh(&s->idr_lock);
-
-	set_bit(CF_CONNECTED, &con->flags);
-	con->server = s;
-
-	return con;
-}
-
-static int tipc_con_rcv_sub(struct tipc_server *srv,
-			    struct tipc_conn *con,
-			    struct tipc_subscr *s)
-{
-	struct tipc_net *tn = tipc_net(srv->net);
-	struct tipc_subscription *sub;
-
-	if (tipc_sub_read(s, filter) & TIPC_SUB_CANCEL) {
-		tipc_con_delete_sub(con, s);
-		return 0;
-	}
-	if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) {
-		pr_warn("Subscription rejected, max (%u)\n", TIPC_MAX_SUBSCR);
-		return -1;
-	}
-	sub = tipc_sub_subscribe(srv->net, s, con->conid);
-	if (!sub)
-		return -1;
-	atomic_inc(&tn->subscription_count);
-	spin_lock_bh(&con->sub_lock);
-	list_add(&sub->sub_list, &con->sub_list);
-	spin_unlock_bh(&con->sub_lock);
-	return 0;
-}
-
-static int tipc_receive_from_sock(struct tipc_conn *con)
-{
-	struct tipc_server *srv = con->server;
-	struct sock *sk = con->sock->sk;
-	struct msghdr msg = {};
-	struct tipc_subscr s;
-	struct kvec iov;
-	int ret;
-
-	iov.iov_base = &s;
-	iov.iov_len = sizeof(s);
-	msg.msg_name = NULL;
-	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len);
-	ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT);
-	if (ret == -EWOULDBLOCK)
-		return -EWOULDBLOCK;
-	if (ret > 0) {
-		read_lock_bh(&sk->sk_callback_lock);
-		ret = tipc_con_rcv_sub(srv, con, &s);
-		read_unlock_bh(&sk->sk_callback_lock);
-	}
-	if (ret < 0)
-		tipc_close_conn(con);
-
-	return ret;
-}
-
-/* tipc_conn_queue_evt() - interrupt level call from a subscription instance
- * The queued work is launched into tipc_send_work()->tipc_send_to_sock()
- */
-void tipc_conn_queue_evt(struct net *net, int conid,
-			 u32 event, struct tipc_event *evt)
-{
-	struct tipc_server *srv = tipc_topsrv(net);
-	struct outqueue_entry *e;
-	struct tipc_conn *con;
-
-	con = tipc_conn_lookup(srv, conid);
-	if (!con)
-		return;
-
-	if (!connected(con))
-		goto err;
-
-	e = kmalloc(sizeof(*e), GFP_ATOMIC);
-	if (!e)
-		goto err;
-	e->inactive = (event == TIPC_SUBSCR_TIMEOUT);
-	memcpy(&e->evt, evt, sizeof(*evt));
-	spin_lock_bh(&con->outqueue_lock);
-	list_add_tail(&e->list, &con->outqueue);
-	spin_unlock_bh(&con->outqueue_lock);
-
-	if (queue_work(srv->send_wq, &con->swork))
-		return;
-err:
-	conn_put(con);
-}
-
-bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
-			     u32 upper, u32 filter, int *conid)
-{
-	struct tipc_subscr sub;
-	struct tipc_conn *con;
-	int rc;
-
-	sub.seq.type = type;
-	sub.seq.lower = lower;
-	sub.seq.upper = upper;
-	sub.timeout = TIPC_WAIT_FOREVER;
-	sub.filter = filter;
-	*(u32 *)&sub.usr_handle = port;
-
-	con = tipc_alloc_conn(tipc_topsrv(net));
-	if (IS_ERR(con))
-		return false;
-
-	*conid = con->conid;
-	con->sock = NULL;
-	rc = tipc_con_rcv_sub(tipc_topsrv(net), con, &sub);
-	if (rc < 0)
-		tipc_close_conn(con);
-	return !rc;
-}
-
-void tipc_topsrv_kern_unsubscr(struct net *net, int conid)
-{
-	struct tipc_conn *con;
-
-	con = tipc_conn_lookup(tipc_topsrv(net), conid);
-	if (!con)
-		return;
-
-	test_and_clear_bit(CF_CONNECTED, &con->flags);
-	tipc_con_delete_sub(con, NULL);
-	conn_put(con);
-	conn_put(con);
-}
-
-static void tipc_send_kern_top_evt(struct net *net, struct tipc_event *evt)
-{
-	u32 port = *(u32 *)&evt->s.usr_handle;
-	u32 self = tipc_own_addr(net);
-	struct sk_buff_head evtq;
-	struct sk_buff *skb;
-
-	skb = tipc_msg_create(TOP_SRV, 0, INT_H_SIZE, sizeof(*evt),
-			      self, self, port, port, 0);
-	if (!skb)
-		return;
-	msg_set_dest_droppable(buf_msg(skb), true);
-	memcpy(msg_data(buf_msg(skb)), evt, sizeof(*evt));
-	skb_queue_head_init(&evtq);
-	__skb_queue_tail(&evtq, skb);
-	tipc_sk_rcv(net, &evtq);
-}
-
-static void tipc_send_to_sock(struct tipc_conn *con)
-{
-	struct list_head *queue = &con->outqueue;
-	struct tipc_server *srv = con->server;
-	struct outqueue_entry *e;
-	struct tipc_event *evt;
-	struct msghdr msg;
-	struct kvec iov;
-	int count = 0;
-	int ret;
-
-	spin_lock_bh(&con->outqueue_lock);
-
-	while (!list_empty(queue)) {
-		e = list_first_entry(queue, struct outqueue_entry, list);
-		evt = &e->evt;
-		spin_unlock_bh(&con->outqueue_lock);
-
-		if (e->inactive)
-			tipc_con_delete_sub(con, &evt->s);
-
-		memset(&msg, 0, sizeof(msg));
-		msg.msg_flags = MSG_DONTWAIT;
-		iov.iov_base = evt;
-		iov.iov_len = sizeof(*evt);
-		msg.msg_name = NULL;
-
-		if (con->sock) {
-			ret = kernel_sendmsg(con->sock, &msg, &iov,
-					     1, sizeof(*evt));
-			if (ret == -EWOULDBLOCK || ret == 0) {
-				cond_resched();
-				return;
-			} else if (ret < 0) {
-				return tipc_close_conn(con);
-			}
-		} else {
-			tipc_send_kern_top_evt(srv->net, evt);
-		}
-
-		/* Don't starve users filling buffers */
-		if (++count >= MAX_SEND_MSG_COUNT) {
-			cond_resched();
-			count = 0;
-		}
-		spin_lock_bh(&con->outqueue_lock);
-		list_del(&e->list);
-		kfree(e);
-	}
-	spin_unlock_bh(&con->outqueue_lock);
-}
-
-static void tipc_recv_work(struct work_struct *work)
-{
-	struct tipc_conn *con = container_of(work, struct tipc_conn, rwork);
-	int count = 0;
-
-	while (connected(con)) {
-		if (tipc_receive_from_sock(con))
-			break;
-
-		/* Don't flood Rx machine */
-		if (++count >= MAX_RECV_MSG_COUNT) {
-			cond_resched();
-			count = 0;
-		}
-	}
-	conn_put(con);
-}
-
-static void tipc_send_work(struct work_struct *work)
-{
-	struct tipc_conn *con = container_of(work, struct tipc_conn, swork);
-
-	if (connected(con))
-		tipc_send_to_sock(con);
-
-	conn_put(con);
-}
-
-static void tipc_accept_from_sock(struct work_struct *work)
-{
-	struct tipc_server *srv = container_of(work, struct tipc_server, awork);
-	struct socket *lsock = srv->listener;
-	struct socket *newsock;
-	struct tipc_conn *con;
-	struct sock *newsk;
-	int ret;
-
-	while (1) {
-		ret = kernel_accept(lsock, &newsock, O_NONBLOCK);
-		if (ret < 0)
-			return;
-		con = tipc_alloc_conn(srv);
-		if (IS_ERR(con)) {
-			ret = PTR_ERR(con);
-			sock_release(newsock);
-			return;
-		}
-		/* Register callbacks */
-		newsk = newsock->sk;
-		write_lock_bh(&newsk->sk_callback_lock);
-		newsk->sk_data_ready = sock_data_ready;
-		newsk->sk_write_space = sock_write_space;
-		newsk->sk_user_data = con;
-		con->sock = newsock;
-		write_unlock_bh(&newsk->sk_callback_lock);
-
-		/* Wake up receive process in case of 'SYN+' message */
-		newsk->sk_data_ready(newsk);
-	}
-}
-
-/* listener_sock_data_ready - interrupt callback indicating new connection
- * The queued job is launched into tipc_accept_from_sock()
- */
-static void listener_sock_data_ready(struct sock *sk)
-{
-	struct tipc_server *srv;
-
-	read_lock_bh(&sk->sk_callback_lock);
-	srv = sk->sk_user_data;
-	if (srv->listener)
-		queue_work(srv->rcv_wq, &srv->awork);
-	read_unlock_bh(&sk->sk_callback_lock);
-}
-
-static int tipc_create_listener_sock(struct tipc_server *srv)
-{
-	int imp = TIPC_CRITICAL_IMPORTANCE;
-	struct socket *lsock = NULL;
-	struct sockaddr_tipc saddr;
-	struct sock *sk;
-	int rc;
-
-	rc = sock_create_kern(srv->net, AF_TIPC, SOCK_SEQPACKET, 0, &lsock);
-	if (rc < 0)
-		return rc;
-
-	srv->listener = lsock;
-	sk = lsock->sk;
-	write_lock_bh(&sk->sk_callback_lock);
-	sk->sk_data_ready = listener_sock_data_ready;
-	sk->sk_user_data = srv;
-	write_unlock_bh(&sk->sk_callback_lock);
-
-	rc = kernel_setsockopt(lsock, SOL_TIPC, TIPC_IMPORTANCE,
-			       (char *)&imp, sizeof(imp));
-	if (rc < 0)
-		goto err;
-
-	saddr.family	                = AF_TIPC;
-	saddr.addrtype		        = TIPC_ADDR_NAMESEQ;
-	saddr.addr.nameseq.type	        = TIPC_TOP_SRV;
-	saddr.addr.nameseq.lower	= TIPC_TOP_SRV;
-	saddr.addr.nameseq.upper	= TIPC_TOP_SRV;
-	saddr.scope			= TIPC_NODE_SCOPE;
-
-	rc = kernel_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr));
-	if (rc < 0)
-		goto err;
-	rc = kernel_listen(lsock, 0);
-	if (rc < 0)
-		goto err;
-
-	/* As server's listening socket owner and creator is the same module,
-	 * we have to decrease TIPC module reference count to guarantee that
-	 * it remains zero after the server socket is created, otherwise,
-	 * executing "rmmod" command is unable to make TIPC module deleted
-	 * after TIPC module is inserted successfully.
-	 *
-	 * However, the reference count is ever increased twice in
-	 * sock_create_kern(): one is to increase the reference count of owner
-	 * of TIPC socket's proto_ops struct; another is to increment the
-	 * reference count of owner of TIPC proto struct. Therefore, we must
-	 * decrement the module reference count twice to ensure that it keeps
-	 * zero after server's listening socket is created. Of course, we
-	 * must bump the module reference count twice as well before the socket
-	 * is closed.
-	 */
-	module_put(lsock->ops->owner);
-	module_put(sk->sk_prot_creator->owner);
-
-	return 0;
-err:
-	sock_release(lsock);
-	return -EINVAL;
-}
-
-static int tipc_work_start(struct tipc_server *s)
-{
-	s->rcv_wq = alloc_ordered_workqueue("tipc_rcv", 0);
-	if (!s->rcv_wq) {
-		pr_err("can't start tipc receive workqueue\n");
-		return -ENOMEM;
-	}
-
-	s->send_wq = alloc_ordered_workqueue("tipc_send", 0);
-	if (!s->send_wq) {
-		pr_err("can't start tipc send workqueue\n");
-		destroy_workqueue(s->rcv_wq);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static void tipc_work_stop(struct tipc_server *s)
-{
-	destroy_workqueue(s->rcv_wq);
-	destroy_workqueue(s->send_wq);
-}
-
-int tipc_topsrv_start(struct net *net)
-{
-	struct tipc_net *tn = tipc_net(net);
-	const char name[] = "topology_server";
-	struct tipc_server *srv;
-	int ret;
-
-	srv = kzalloc(sizeof(*srv), GFP_ATOMIC);
-	if (!srv)
-		return -ENOMEM;
-
-	srv->net = net;
-	srv->max_rcvbuf_size = sizeof(struct tipc_subscr);
-	INIT_WORK(&srv->awork, tipc_accept_from_sock);
-
-	strncpy(srv->name, name, strlen(name) + 1);
-	tn->topsrv = srv;
-	atomic_set(&tn->subscription_count, 0);
-
-	spin_lock_init(&srv->idr_lock);
-	idr_init(&srv->conn_idr);
-	srv->idr_in_use = 0;
-
-	ret = tipc_work_start(srv);
-	if (ret < 0)
-		return ret;
-
-	ret = tipc_create_listener_sock(srv);
-	if (ret < 0)
-		tipc_work_stop(srv);
-
-	return ret;
-}
-
-void tipc_topsrv_stop(struct net *net)
-{
-	struct tipc_server *srv = tipc_topsrv(net);
-	struct socket *lsock = srv->listener;
-	struct tipc_conn *con;
-	int id;
-
-	spin_lock_bh(&srv->idr_lock);
-	for (id = 0; srv->idr_in_use; id++) {
-		con = idr_find(&srv->conn_idr, id);
-		if (con) {
-			spin_unlock_bh(&srv->idr_lock);
-			tipc_close_conn(con);
-			spin_lock_bh(&srv->idr_lock);
-		}
-	}
-	__module_get(lsock->ops->owner);
-	__module_get(lsock->sk->sk_prot_creator->owner);
-	sock_release(lsock);
-	srv->listener = NULL;
-	spin_unlock_bh(&srv->idr_lock);
-	tipc_work_stop(srv);
-	idr_destroy(&srv->conn_idr);
-	kfree(srv);
-}
diff --git a/net/tipc/server.h b/net/tipc/server.h
deleted file mode 100644
index ce93b6d68e41..000000000000
--- a/net/tipc/server.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * net/tipc/server.h: Include file for TIPC server code
- *
- * Copyright (c) 2012-2013, Wind River Systems
- * Copyright (c) 2017, Ericsson AB
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the names of the copyright holders nor the names of its
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL") version 2 as published by the Free
- * Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _TIPC_SERVER_H
-#define _TIPC_SERVER_H
-
-#include "core.h"
-#include <linux/idr.h>
-#include <linux/tipc.h>
-#include <net/net_namespace.h>
-
-#define TIPC_SERVER_NAME_LEN	32
-#define TIPC_SUB_CLUSTER_SCOPE  0x20
-#define TIPC_SUB_NODE_SCOPE     0x40
-#define TIPC_SUB_NO_STATUS      0x80
-
-void tipc_conn_queue_evt(struct net *net, int conid,
-			 u32 event, struct tipc_event *evt);
-
-bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
-			     u32 upper, u32 filter, int *conid);
-void tipc_topsrv_kern_unsubscr(struct net *net, int conid);
-
-#endif
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index c8146568d04e..6925a989569b 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -51,7 +51,7 @@ static void tipc_sub_send_event(struct tipc_subscription *sub,
 	tipc_evt_write(evt, found_upper, found_upper);
 	tipc_evt_write(evt, port.ref, port);
 	tipc_evt_write(evt, port.node, node);
-	tipc_conn_queue_evt(sub->net, sub->conid, event, evt);
+	tipc_topsrv_queue_evt(sub->net, sub->conid, event, evt);
 }
 
 /**
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index 82ba61afe638..8b2d22b18f22 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -37,7 +37,7 @@
 #ifndef _TIPC_SUBSCR_H
 #define _TIPC_SUBSCR_H
 
-#include "server.h"
+#include "topsrv.h"
 
 #define TIPC_MAX_SUBSCR         65535
 #define TIPC_MAX_PUBLICATIONS   65535
diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c
new file mode 100644
index 000000000000..02013e00f287
--- /dev/null
+++ b/net/tipc/topsrv.c
@@ -0,0 +1,702 @@
+/*
+ * net/tipc/server.c: TIPC server infrastructure
+ *
+ * Copyright (c) 2012-2013, Wind River Systems
+ * Copyright (c) 2017-2018, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "subscr.h"
+#include "topsrv.h"
+#include "core.h"
+#include "socket.h"
+#include "addr.h"
+#include "msg.h"
+#include <net/sock.h>
+#include <linux/module.h>
+
+/* Number of messages to send before rescheduling */
+#define MAX_SEND_MSG_COUNT	25
+#define MAX_RECV_MSG_COUNT	25
+#define CF_CONNECTED		1
+#define CF_SERVER		2
+
+#define TIPC_SERVER_NAME_LEN	32
+
+/**
+ * struct tipc_topsrv - TIPC server structure
+ * @conn_idr: identifier set of connection
+ * @idr_lock: protect the connection identifier set
+ * @idr_in_use: amount of allocated identifier entry
+ * @net: network namspace instance
+ * @rcvbuf_cache: memory cache of server receive buffer
+ * @rcv_wq: receive workqueue
+ * @send_wq: send workqueue
+ * @max_rcvbuf_size: maximum permitted receive message length
+ * @tipc_conn_new: callback will be called when new connection is incoming
+ * @tipc_conn_release: callback will be called before releasing the connection
+ * @tipc_conn_recvmsg: callback will be called when message arrives
+ * @name: server name
+ * @imp: message importance
+ * @type: socket type
+ */
+struct tipc_topsrv {
+	struct idr conn_idr;
+	spinlock_t idr_lock; /* for idr list */
+	int idr_in_use;
+	struct net *net;
+	struct work_struct awork;
+	struct workqueue_struct *rcv_wq;
+	struct workqueue_struct *send_wq;
+	int max_rcvbuf_size;
+	struct socket *listener;
+	char name[TIPC_SERVER_NAME_LEN];
+};
+
+/**
+ * struct tipc_conn - TIPC connection structure
+ * @kref: reference counter to connection object
+ * @conid: connection identifier
+ * @sock: socket handler associated with connection
+ * @flags: indicates connection state
+ * @server: pointer to connected server
+ * @sub_list: lsit to all pertaing subscriptions
+ * @sub_lock: lock protecting the subscription list
+ * @outqueue_lock: control access to the outqueue
+ * @rwork: receive work item
+ * @rx_action: what to do when connection socket is active
+ * @outqueue: pointer to first outbound message in queue
+ * @outqueue_lock: control access to the outqueue
+ * @swork: send work item
+ */
+struct tipc_conn {
+	struct kref kref;
+	int conid;
+	struct socket *sock;
+	unsigned long flags;
+	struct tipc_topsrv *server;
+	struct list_head sub_list;
+	spinlock_t sub_lock; /* for subscription list */
+	struct work_struct rwork;
+	struct list_head outqueue;
+	spinlock_t outqueue_lock; /* for outqueue */
+	struct work_struct swork;
+};
+
+/* An entry waiting to be sent */
+struct outqueue_entry {
+	bool inactive;
+	struct tipc_event evt;
+	struct list_head list;
+};
+
+static void tipc_conn_recv_work(struct work_struct *work);
+static void tipc_conn_send_work(struct work_struct *work);
+static void tipc_topsrv_kern_evt(struct net *net, struct tipc_event *evt);
+static void tipc_conn_delete_sub(struct tipc_conn *con, struct tipc_subscr *s);
+
+static bool connected(struct tipc_conn *con)
+{
+	return con && test_bit(CF_CONNECTED, &con->flags);
+}
+
+static void tipc_conn_kref_release(struct kref *kref)
+{
+	struct tipc_conn *con = container_of(kref, struct tipc_conn, kref);
+	struct tipc_topsrv *s = con->server;
+	struct outqueue_entry *e, *safe;
+
+	spin_lock_bh(&s->idr_lock);
+	idr_remove(&s->conn_idr, con->conid);
+	s->idr_in_use--;
+	spin_unlock_bh(&s->idr_lock);
+	if (con->sock)
+		sock_release(con->sock);
+
+	spin_lock_bh(&con->outqueue_lock);
+	list_for_each_entry_safe(e, safe, &con->outqueue, list) {
+		list_del(&e->list);
+		kfree(e);
+	}
+	spin_unlock_bh(&con->outqueue_lock);
+	kfree(con);
+}
+
+static void conn_put(struct tipc_conn *con)
+{
+	kref_put(&con->kref, tipc_conn_kref_release);
+}
+
+static void conn_get(struct tipc_conn *con)
+{
+	kref_get(&con->kref);
+}
+
+static void tipc_conn_close(struct tipc_conn *con)
+{
+	struct sock *sk = con->sock->sk;
+	bool disconnect = false;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	disconnect = test_and_clear_bit(CF_CONNECTED, &con->flags);
+
+	if (disconnect) {
+		sk->sk_user_data = NULL;
+		tipc_conn_delete_sub(con, NULL);
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	/* Handle concurrent calls from sending and receiving threads */
+	if (!disconnect)
+		return;
+
+	/* Don't flush pending works, -just let them expire */
+	kernel_sock_shutdown(con->sock, SHUT_RDWR);
+
+	conn_put(con);
+}
+
+static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s)
+{
+	struct tipc_conn *con;
+	int ret;
+
+	con = kzalloc(sizeof(*con), GFP_ATOMIC);
+	if (!con)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&con->kref);
+	INIT_LIST_HEAD(&con->outqueue);
+	INIT_LIST_HEAD(&con->sub_list);
+	spin_lock_init(&con->outqueue_lock);
+	spin_lock_init(&con->sub_lock);
+	INIT_WORK(&con->swork, tipc_conn_send_work);
+	INIT_WORK(&con->rwork, tipc_conn_recv_work);
+
+	spin_lock_bh(&s->idr_lock);
+	ret = idr_alloc(&s->conn_idr, con, 0, 0, GFP_ATOMIC);
+	if (ret < 0) {
+		kfree(con);
+		spin_unlock_bh(&s->idr_lock);
+		return ERR_PTR(-ENOMEM);
+	}
+	con->conid = ret;
+	s->idr_in_use++;
+	spin_unlock_bh(&s->idr_lock);
+
+	set_bit(CF_CONNECTED, &con->flags);
+	con->server = s;
+
+	return con;
+}
+
+static struct tipc_conn *tipc_conn_lookup(struct tipc_topsrv *s, int conid)
+{
+	struct tipc_conn *con;
+
+	spin_lock_bh(&s->idr_lock);
+	con = idr_find(&s->conn_idr, conid);
+	if (!connected(con) || !kref_get_unless_zero(&con->kref))
+		con = NULL;
+	spin_unlock_bh(&s->idr_lock);
+	return con;
+}
+
+/* tipc_conn_delete_sub - delete a specific or all subscriptions
+ * for a given subscriber
+ */
+static void tipc_conn_delete_sub(struct tipc_conn *con, struct tipc_subscr *s)
+{
+	struct tipc_net *tn = tipc_net(con->server->net);
+	struct list_head *sub_list = &con->sub_list;
+	struct tipc_subscription *sub, *tmp;
+
+	spin_lock_bh(&con->sub_lock);
+	list_for_each_entry_safe(sub, tmp, sub_list, sub_list) {
+		if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) {
+			tipc_sub_unsubscribe(sub);
+			atomic_dec(&tn->subscription_count);
+		} else if (s) {
+			break;
+		}
+	}
+	spin_unlock_bh(&con->sub_lock);
+}
+
+static void tipc_conn_send_to_sock(struct tipc_conn *con)
+{
+	struct list_head *queue = &con->outqueue;
+	struct tipc_topsrv *srv = con->server;
+	struct outqueue_entry *e;
+	struct tipc_event *evt;
+	struct msghdr msg;
+	struct kvec iov;
+	int count = 0;
+	int ret;
+
+	spin_lock_bh(&con->outqueue_lock);
+
+	while (!list_empty(queue)) {
+		e = list_first_entry(queue, struct outqueue_entry, list);
+		evt = &e->evt;
+		spin_unlock_bh(&con->outqueue_lock);
+
+		if (e->inactive)
+			tipc_conn_delete_sub(con, &evt->s);
+
+		memset(&msg, 0, sizeof(msg));
+		msg.msg_flags = MSG_DONTWAIT;
+		iov.iov_base = evt;
+		iov.iov_len = sizeof(*evt);
+		msg.msg_name = NULL;
+
+		if (con->sock) {
+			ret = kernel_sendmsg(con->sock, &msg, &iov,
+					     1, sizeof(*evt));
+			if (ret == -EWOULDBLOCK || ret == 0) {
+				cond_resched();
+				return;
+			} else if (ret < 0) {
+				return tipc_conn_close(con);
+			}
+		} else {
+			tipc_topsrv_kern_evt(srv->net, evt);
+		}
+
+		/* Don't starve users filling buffers */
+		if (++count >= MAX_SEND_MSG_COUNT) {
+			cond_resched();
+			count = 0;
+		}
+		spin_lock_bh(&con->outqueue_lock);
+		list_del(&e->list);
+		kfree(e);
+	}
+	spin_unlock_bh(&con->outqueue_lock);
+}
+
+static void tipc_conn_send_work(struct work_struct *work)
+{
+	struct tipc_conn *con = container_of(work, struct tipc_conn, swork);
+
+	if (connected(con))
+		tipc_conn_send_to_sock(con);
+
+	conn_put(con);
+}
+
+/* tipc_conn_queue_evt() - interrupt level call from a subscription instance
+ * The queued work is launched into tipc_send_work()->tipc_send_to_sock()
+ */
+void tipc_topsrv_queue_evt(struct net *net, int conid,
+			   u32 event, struct tipc_event *evt)
+{
+	struct tipc_topsrv *srv = tipc_topsrv(net);
+	struct outqueue_entry *e;
+	struct tipc_conn *con;
+
+	con = tipc_conn_lookup(srv, conid);
+	if (!con)
+		return;
+
+	if (!connected(con))
+		goto err;
+
+	e = kmalloc(sizeof(*e), GFP_ATOMIC);
+	if (!e)
+		goto err;
+	e->inactive = (event == TIPC_SUBSCR_TIMEOUT);
+	memcpy(&e->evt, evt, sizeof(*evt));
+	spin_lock_bh(&con->outqueue_lock);
+	list_add_tail(&e->list, &con->outqueue);
+	spin_unlock_bh(&con->outqueue_lock);
+
+	if (queue_work(srv->send_wq, &con->swork))
+		return;
+err:
+	conn_put(con);
+}
+
+/* tipc_conn_write_space - interrupt callback after a sendmsg EAGAIN
+ * Indicates that there now is more space in the send buffer
+ * The queued work is launched into tipc_send_work()->tipc_conn_send_to_sock()
+ */
+static void tipc_conn_write_space(struct sock *sk)
+{
+	struct tipc_conn *con;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	con = sk->sk_user_data;
+	if (connected(con)) {
+		conn_get(con);
+		if (!queue_work(con->server->send_wq, &con->swork))
+			conn_put(con);
+	}
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int tipc_conn_rcv_sub(struct tipc_topsrv *srv,
+			     struct tipc_conn *con,
+			     struct tipc_subscr *s)
+{
+	struct tipc_net *tn = tipc_net(srv->net);
+	struct tipc_subscription *sub;
+
+	if (tipc_sub_read(s, filter) & TIPC_SUB_CANCEL) {
+		tipc_conn_delete_sub(con, s);
+		return 0;
+	}
+	if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCR) {
+		pr_warn("Subscription rejected, max (%u)\n", TIPC_MAX_SUBSCR);
+		return -1;
+	}
+	sub = tipc_sub_subscribe(srv->net, s, con->conid);
+	if (!sub)
+		return -1;
+	atomic_inc(&tn->subscription_count);
+	spin_lock_bh(&con->sub_lock);
+	list_add(&sub->sub_list, &con->sub_list);
+	spin_unlock_bh(&con->sub_lock);
+	return 0;
+}
+
+static int tipc_conn_rcv_from_sock(struct tipc_conn *con)
+{
+	struct tipc_topsrv *srv = con->server;
+	struct sock *sk = con->sock->sk;
+	struct msghdr msg = {};
+	struct tipc_subscr s;
+	struct kvec iov;
+	int ret;
+
+	iov.iov_base = &s;
+	iov.iov_len = sizeof(s);
+	msg.msg_name = NULL;
+	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, iov.iov_len);
+	ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT);
+	if (ret == -EWOULDBLOCK)
+		return -EWOULDBLOCK;
+	if (ret > 0) {
+		read_lock_bh(&sk->sk_callback_lock);
+		ret = tipc_conn_rcv_sub(srv, con, &s);
+		read_unlock_bh(&sk->sk_callback_lock);
+	}
+	if (ret < 0)
+		tipc_conn_close(con);
+
+	return ret;
+}
+
+static void tipc_conn_recv_work(struct work_struct *work)
+{
+	struct tipc_conn *con = container_of(work, struct tipc_conn, rwork);
+	int count = 0;
+
+	while (connected(con)) {
+		if (tipc_conn_rcv_from_sock(con))
+			break;
+
+		/* Don't flood Rx machine */
+		if (++count >= MAX_RECV_MSG_COUNT) {
+			cond_resched();
+			count = 0;
+		}
+	}
+	conn_put(con);
+}
+
+/* tipc_conn_data_ready - interrupt callback indicating the socket has data
+ * The queued work is launched into tipc_recv_work()->tipc_conn_rcv_from_sock()
+ */
+static void tipc_conn_data_ready(struct sock *sk)
+{
+	struct tipc_conn *con;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	con = sk->sk_user_data;
+	if (connected(con)) {
+		conn_get(con);
+		if (!queue_work(con->server->rcv_wq, &con->rwork))
+			conn_put(con);
+	}
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void tipc_topsrv_accept(struct work_struct *work)
+{
+	struct tipc_topsrv *srv = container_of(work, struct tipc_topsrv, awork);
+	struct socket *lsock = srv->listener;
+	struct socket *newsock;
+	struct tipc_conn *con;
+	struct sock *newsk;
+	int ret;
+
+	while (1) {
+		ret = kernel_accept(lsock, &newsock, O_NONBLOCK);
+		if (ret < 0)
+			return;
+		con = tipc_conn_alloc(srv);
+		if (IS_ERR(con)) {
+			ret = PTR_ERR(con);
+			sock_release(newsock);
+			return;
+		}
+		/* Register callbacks */
+		newsk = newsock->sk;
+		write_lock_bh(&newsk->sk_callback_lock);
+		newsk->sk_data_ready = tipc_conn_data_ready;
+		newsk->sk_write_space = tipc_conn_write_space;
+		newsk->sk_user_data = con;
+		con->sock = newsock;
+		write_unlock_bh(&newsk->sk_callback_lock);
+
+		/* Wake up receive process in case of 'SYN+' message */
+		newsk->sk_data_ready(newsk);
+	}
+}
+
+/* tipc_toprsv_listener_data_ready - interrupt callback with connection request
+ * The queued job is launched into tipc_topsrv_accept()
+ */
+static void tipc_topsrv_listener_data_ready(struct sock *sk)
+{
+	struct tipc_topsrv *srv;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	srv = sk->sk_user_data;
+	if (srv->listener)
+		queue_work(srv->rcv_wq, &srv->awork);
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int tipc_topsrv_create_listener(struct tipc_topsrv *srv)
+{
+	int imp = TIPC_CRITICAL_IMPORTANCE;
+	struct socket *lsock = NULL;
+	struct sockaddr_tipc saddr;
+	struct sock *sk;
+	int rc;
+
+	rc = sock_create_kern(srv->net, AF_TIPC, SOCK_SEQPACKET, 0, &lsock);
+	if (rc < 0)
+		return rc;
+
+	srv->listener = lsock;
+	sk = lsock->sk;
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_data_ready = tipc_topsrv_listener_data_ready;
+	sk->sk_user_data = srv;
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	rc = kernel_setsockopt(lsock, SOL_TIPC, TIPC_IMPORTANCE,
+			       (char *)&imp, sizeof(imp));
+	if (rc < 0)
+		goto err;
+
+	saddr.family	                = AF_TIPC;
+	saddr.addrtype		        = TIPC_ADDR_NAMESEQ;
+	saddr.addr.nameseq.type	        = TIPC_TOP_SRV;
+	saddr.addr.nameseq.lower	= TIPC_TOP_SRV;
+	saddr.addr.nameseq.upper	= TIPC_TOP_SRV;
+	saddr.scope			= TIPC_NODE_SCOPE;
+
+	rc = kernel_bind(lsock, (struct sockaddr *)&saddr, sizeof(saddr));
+	if (rc < 0)
+		goto err;
+	rc = kernel_listen(lsock, 0);
+	if (rc < 0)
+		goto err;
+
+	/* As server's listening socket owner and creator is the same module,
+	 * we have to decrease TIPC module reference count to guarantee that
+	 * it remains zero after the server socket is created, otherwise,
+	 * executing "rmmod" command is unable to make TIPC module deleted
+	 * after TIPC module is inserted successfully.
+	 *
+	 * However, the reference count is ever increased twice in
+	 * sock_create_kern(): one is to increase the reference count of owner
+	 * of TIPC socket's proto_ops struct; another is to increment the
+	 * reference count of owner of TIPC proto struct. Therefore, we must
+	 * decrement the module reference count twice to ensure that it keeps
+	 * zero after server's listening socket is created. Of course, we
+	 * must bump the module reference count twice as well before the socket
+	 * is closed.
+	 */
+	module_put(lsock->ops->owner);
+	module_put(sk->sk_prot_creator->owner);
+
+	return 0;
+err:
+	sock_release(lsock);
+	return -EINVAL;
+}
+
+bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
+			     u32 upper, u32 filter, int *conid)
+{
+	struct tipc_subscr sub;
+	struct tipc_conn *con;
+	int rc;
+
+	sub.seq.type = type;
+	sub.seq.lower = lower;
+	sub.seq.upper = upper;
+	sub.timeout = TIPC_WAIT_FOREVER;
+	sub.filter = filter;
+	*(u32 *)&sub.usr_handle = port;
+
+	con = tipc_conn_alloc(tipc_topsrv(net));
+	if (IS_ERR(con))
+		return false;
+
+	*conid = con->conid;
+	con->sock = NULL;
+	rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub);
+	if (rc < 0)
+		tipc_conn_close(con);
+	return !rc;
+}
+
+void tipc_topsrv_kern_unsubscr(struct net *net, int conid)
+{
+	struct tipc_conn *con;
+
+	con = tipc_conn_lookup(tipc_topsrv(net), conid);
+	if (!con)
+		return;
+
+	test_and_clear_bit(CF_CONNECTED, &con->flags);
+	tipc_conn_delete_sub(con, NULL);
+	conn_put(con);
+	conn_put(con);
+}
+
+static void tipc_topsrv_kern_evt(struct net *net, struct tipc_event *evt)
+{
+	u32 port = *(u32 *)&evt->s.usr_handle;
+	u32 self = tipc_own_addr(net);
+	struct sk_buff_head evtq;
+	struct sk_buff *skb;
+
+	skb = tipc_msg_create(TOP_SRV, 0, INT_H_SIZE, sizeof(*evt),
+			      self, self, port, port, 0);
+	if (!skb)
+		return;
+	msg_set_dest_droppable(buf_msg(skb), true);
+	memcpy(msg_data(buf_msg(skb)), evt, sizeof(*evt));
+	skb_queue_head_init(&evtq);
+	__skb_queue_tail(&evtq, skb);
+	tipc_sk_rcv(net, &evtq);
+}
+
+static int tipc_topsrv_work_start(struct tipc_topsrv *s)
+{
+	s->rcv_wq = alloc_ordered_workqueue("tipc_rcv", 0);
+	if (!s->rcv_wq) {
+		pr_err("can't start tipc receive workqueue\n");
+		return -ENOMEM;
+	}
+
+	s->send_wq = alloc_ordered_workqueue("tipc_send", 0);
+	if (!s->send_wq) {
+		pr_err("can't start tipc send workqueue\n");
+		destroy_workqueue(s->rcv_wq);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void tipc_topsrv_work_stop(struct tipc_topsrv *s)
+{
+	destroy_workqueue(s->rcv_wq);
+	destroy_workqueue(s->send_wq);
+}
+
+int tipc_topsrv_start(struct net *net)
+{
+	struct tipc_net *tn = tipc_net(net);
+	const char name[] = "topology_server";
+	struct tipc_topsrv *srv;
+	int ret;
+
+	srv = kzalloc(sizeof(*srv), GFP_ATOMIC);
+	if (!srv)
+		return -ENOMEM;
+
+	srv->net = net;
+	srv->max_rcvbuf_size = sizeof(struct tipc_subscr);
+	INIT_WORK(&srv->awork, tipc_topsrv_accept);
+
+	strncpy(srv->name, name, strlen(name) + 1);
+	tn->topsrv = srv;
+	atomic_set(&tn->subscription_count, 0);
+
+	spin_lock_init(&srv->idr_lock);
+	idr_init(&srv->conn_idr);
+	srv->idr_in_use = 0;
+
+	ret = tipc_topsrv_work_start(srv);
+	if (ret < 0)
+		return ret;
+
+	ret = tipc_topsrv_create_listener(srv);
+	if (ret < 0)
+		tipc_topsrv_work_stop(srv);
+
+	return ret;
+}
+
+void tipc_topsrv_stop(struct net *net)
+{
+	struct tipc_topsrv *srv = tipc_topsrv(net);
+	struct socket *lsock = srv->listener;
+	struct tipc_conn *con;
+	int id;
+
+	spin_lock_bh(&srv->idr_lock);
+	for (id = 0; srv->idr_in_use; id++) {
+		con = idr_find(&srv->conn_idr, id);
+		if (con) {
+			spin_unlock_bh(&srv->idr_lock);
+			tipc_conn_close(con);
+			spin_lock_bh(&srv->idr_lock);
+		}
+	}
+	__module_get(lsock->ops->owner);
+	__module_get(lsock->sk->sk_prot_creator->owner);
+	sock_release(lsock);
+	srv->listener = NULL;
+	spin_unlock_bh(&srv->idr_lock);
+	tipc_topsrv_work_stop(srv);
+	idr_destroy(&srv->conn_idr);
+	kfree(srv);
+}
diff --git a/net/tipc/topsrv.h b/net/tipc/topsrv.h
new file mode 100644
index 000000000000..c7ea71293748
--- /dev/null
+++ b/net/tipc/topsrv.h
@@ -0,0 +1,54 @@
+/*
+ * net/tipc/server.h: Include file for TIPC server code
+ *
+ * Copyright (c) 2012-2013, Wind River Systems
+ * Copyright (c) 2017, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TIPC_SERVER_H
+#define _TIPC_SERVER_H
+
+#include "core.h"
+
+#define TIPC_SERVER_NAME_LEN	32
+#define TIPC_SUB_CLUSTER_SCOPE  0x20
+#define TIPC_SUB_NODE_SCOPE     0x40
+#define TIPC_SUB_NO_STATUS      0x80
+
+void tipc_topsrv_queue_evt(struct net *net, int conid,
+			   u32 event, struct tipc_event *evt);
+
+bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
+			     u32 upper, u32 filter, int *conid);
+void tipc_topsrv_kern_unsubscr(struct net *net, int conid);
+
+#endif
-- 
cgit v1.2.3


From 66dede2d6b2340235ca212532275446d7bb010fe Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Thu, 15 Feb 2018 15:50:57 +0100
Subject: net: sched: fix unbalance in the error path of tca_action_flush()

When tca_action_flush() calls the action walk() and gets an error,
a successful call to nla_nest_start() is not followed by a call to
nla_nest_cancel(). It's harmless, as the skb is freed in the error
path - but it's worth to fix this unbalance.

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 4886ea4a7d6e..624995564e5a 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -938,8 +938,10 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 		goto out_module_put;
 
 	err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
-	if (err <= 0)
+	if (err <= 0) {
+		nla_nest_cancel(skb, nest);
 		goto out_module_put;
+	}
 
 	nla_nest_end(skb, nest);
 
-- 
cgit v1.2.3


From b7b347fa3cd496ad5b4cbcc8ea2931847c4d0d78 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 15 Feb 2018 10:54:53 -0500
Subject: net: sched: act: fix code style

This patch is used by subsequent patches. It fixes code style issues
caught by checkpatch.

Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c    | 12 ++++++------
 net/sched/act_mirred.c |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 624995564e5a..a32e6c2edbf6 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -621,7 +621,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 			goto err_out;
 		err = -EINVAL;
 		kind = tb[TCA_ACT_KIND];
-		if (kind == NULL)
+		if (!kind)
 			goto err_out;
 		if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ)
 			goto err_out;
@@ -822,7 +822,7 @@ static int tca_get_fill(struct sk_buff *skb, struct list_head *actions,
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
-	if (nest == NULL)
+	if (!nest)
 		goto out_nlmsg_trim;
 
 	if (tcf_action_dump(skb, actions, bind, ref) < 0)
@@ -934,7 +934,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
-	if (nest == NULL)
+	if (!nest)
 		goto out_module_put;
 
 	err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
@@ -1007,10 +1007,10 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 		return ret;
 
 	if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) {
-		if (tb[1] != NULL)
+		if (tb[1])
 			return tca_action_flush(net, tb[1], n, portid);
-		else
-			return -EINVAL;
+
+		return -EINVAL;
 	}
 
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index e6ff88f72900..abcd5f12b913 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -80,12 +80,12 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	bool exists = false;
 	int ret;
 
-	if (nla == NULL)
+	if (!nla)
 		return -EINVAL;
 	ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, NULL);
 	if (ret < 0)
 		return ret;
-	if (tb[TCA_MIRRED_PARMS] == NULL)
+	if (!tb[TCA_MIRRED_PARMS])
 		return -EINVAL;
 	parm = nla_data(tb[TCA_MIRRED_PARMS]);
 
@@ -117,7 +117,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (!exists) {
-		if (dev == NULL)
+		if (!dev)
 			return -EINVAL;
 		ret = tcf_idr_create(tn, parm->index, est, a,
 				     &act_mirred_ops, bind, true);
-- 
cgit v1.2.3


From 10defbd29e6218c1cab5c217a9d808fc05e3938a Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 15 Feb 2018 10:54:54 -0500
Subject: net: sched: act: add extack to init

This patch adds extack to tcf_action_init and tcf_action_init_1
functions. These are necessary to make individual extack handling in
each act implementation.

Based on work by David Ahern <dsahern@gmail.com>

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 17 +++++++++++------
 net/sched/cls_api.c |  4 ++--
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index a32e6c2edbf6..662574646256 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -605,7 +605,8 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
 
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 				    struct nlattr *nla, struct nlattr *est,
-				    char *name, int ovr, int bind)
+				    char *name, int ovr, int bind,
+				    struct netlink_ext_ack *extack)
 {
 	struct tc_action *a;
 	struct tc_action_ops *a_o;
@@ -726,7 +727,7 @@ static void cleanup_a(struct list_head *actions, int ovr)
 
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 		    struct nlattr *est, char *name, int ovr, int bind,
-		    struct list_head *actions)
+		    struct list_head *actions, struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
 	struct tc_action *act;
@@ -738,7 +739,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 		return err;
 
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
-		act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind);
+		act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind,
+					extack);
 		if (IS_ERR(act)) {
 			err = PTR_ERR(act);
 			goto err;
@@ -1062,12 +1064,14 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
 }
 
 static int tcf_action_add(struct net *net, struct nlattr *nla,
-			  struct nlmsghdr *n, u32 portid, int ovr)
+			  struct nlmsghdr *n, u32 portid, int ovr,
+			  struct netlink_ext_ack *extack)
 {
 	int ret = 0;
 	LIST_HEAD(actions);
 
-	ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions);
+	ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions,
+			      extack);
 	if (ret)
 		return ret;
 
@@ -1115,7 +1119,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 		if (n->nlmsg_flags & NLM_F_REPLACE)
 			ovr = 1;
 replay:
-		ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr);
+		ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr,
+				     extack);
 		if (ret == -EAGAIN)
 			goto replay;
 		break;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 2bc1bc23d42e..f21610c5da1a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1434,7 +1434,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 		if (exts->police && tb[exts->police]) {
 			act = tcf_action_init_1(net, tp, tb[exts->police],
 						rate_tlv, "police", ovr,
-						TCA_ACT_BIND);
+						TCA_ACT_BIND, extack);
 			if (IS_ERR(act))
 				return PTR_ERR(act);
 
@@ -1447,7 +1447,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 
 			err = tcf_action_init(net, tp, tb[exts->action],
 					      rate_tlv, NULL, ovr, TCA_ACT_BIND,
-					      &actions);
+					      &actions, extack);
 			if (err)
 				return err;
 			list_for_each_entry(act, &actions, list)
-- 
cgit v1.2.3


From ee99b2d8bf4ad6d03046a8c2f25bad7cfd9de64a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 16 Feb 2018 16:03:39 -0500
Subject: net: Revert sched action extack support series.

It was mis-applied and the changes had rejects.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c    | 29 ++++++++++++-----------------
 net/sched/act_mirred.c |  6 +++---
 net/sched/cls_api.c    |  4 ++--
 3 files changed, 17 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 662574646256..624995564e5a 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -605,8 +605,7 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
 
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 				    struct nlattr *nla, struct nlattr *est,
-				    char *name, int ovr, int bind,
-				    struct netlink_ext_ack *extack)
+				    char *name, int ovr, int bind)
 {
 	struct tc_action *a;
 	struct tc_action_ops *a_o;
@@ -622,7 +621,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 			goto err_out;
 		err = -EINVAL;
 		kind = tb[TCA_ACT_KIND];
-		if (!kind)
+		if (kind == NULL)
 			goto err_out;
 		if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ)
 			goto err_out;
@@ -727,7 +726,7 @@ static void cleanup_a(struct list_head *actions, int ovr)
 
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 		    struct nlattr *est, char *name, int ovr, int bind,
-		    struct list_head *actions, struct netlink_ext_ack *extack)
+		    struct list_head *actions)
 {
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
 	struct tc_action *act;
@@ -739,8 +738,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 		return err;
 
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
-		act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind,
-					extack);
+		act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind);
 		if (IS_ERR(act)) {
 			err = PTR_ERR(act);
 			goto err;
@@ -824,7 +822,7 @@ static int tca_get_fill(struct sk_buff *skb, struct list_head *actions,
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
-	if (!nest)
+	if (nest == NULL)
 		goto out_nlmsg_trim;
 
 	if (tcf_action_dump(skb, actions, bind, ref) < 0)
@@ -936,7 +934,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
-	if (!nest)
+	if (nest == NULL)
 		goto out_module_put;
 
 	err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
@@ -1009,10 +1007,10 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 		return ret;
 
 	if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) {
-		if (tb[1])
+		if (tb[1] != NULL)
 			return tca_action_flush(net, tb[1], n, portid);
-
-		return -EINVAL;
+		else
+			return -EINVAL;
 	}
 
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
@@ -1064,14 +1062,12 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
 }
 
 static int tcf_action_add(struct net *net, struct nlattr *nla,
-			  struct nlmsghdr *n, u32 portid, int ovr,
-			  struct netlink_ext_ack *extack)
+			  struct nlmsghdr *n, u32 portid, int ovr)
 {
 	int ret = 0;
 	LIST_HEAD(actions);
 
-	ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions,
-			      extack);
+	ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions);
 	if (ret)
 		return ret;
 
@@ -1119,8 +1115,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 		if (n->nlmsg_flags & NLM_F_REPLACE)
 			ovr = 1;
 replay:
-		ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr,
-				     extack);
+		ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr);
 		if (ret == -EAGAIN)
 			goto replay;
 		break;
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index abcd5f12b913..e6ff88f72900 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -80,12 +80,12 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	bool exists = false;
 	int ret;
 
-	if (!nla)
+	if (nla == NULL)
 		return -EINVAL;
 	ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, NULL);
 	if (ret < 0)
 		return ret;
-	if (!tb[TCA_MIRRED_PARMS])
+	if (tb[TCA_MIRRED_PARMS] == NULL)
 		return -EINVAL;
 	parm = nla_data(tb[TCA_MIRRED_PARMS]);
 
@@ -117,7 +117,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (!exists) {
-		if (!dev)
+		if (dev == NULL)
 			return -EINVAL;
 		ret = tcf_idr_create(tn, parm->index, est, a,
 				     &act_mirred_ops, bind, true);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index f21610c5da1a..2bc1bc23d42e 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1434,7 +1434,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 		if (exts->police && tb[exts->police]) {
 			act = tcf_action_init_1(net, tp, tb[exts->police],
 						rate_tlv, "police", ovr,
-						TCA_ACT_BIND, extack);
+						TCA_ACT_BIND);
 			if (IS_ERR(act))
 				return PTR_ERR(act);
 
@@ -1447,7 +1447,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 
 			err = tcf_action_init(net, tp, tb[exts->action],
 					      rate_tlv, NULL, ovr, TCA_ACT_BIND,
-					      &actions, extack);
+					      &actions);
 			if (err)
 				return err;
 			list_for_each_entry(act, &actions, list)
-- 
cgit v1.2.3


From 6f89dbce8e1134458de8a8e376acaaca4eee602e Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 15 Feb 2018 10:49:32 -0800
Subject: skbuff: export mm_[un]account_pinned_pages for other modules

RDS would like to use the helper functions for managing pinned pages
added by Commit a91dbff551a6 ("sock: ulimit on MSG_ZEROCOPY pages")

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 09bd89c90a71..1a7485a2cdfa 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -890,7 +890,7 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 }
 EXPORT_SYMBOL_GPL(skb_morph);
 
-static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
+int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
 {
 	unsigned long max_pg, num_pg, new_pg, old_pg;
 	struct user_struct *user;
@@ -919,14 +919,16 @@ static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(mm_account_pinned_pages);
 
-static void mm_unaccount_pinned_pages(struct mmpin *mmp)
+void mm_unaccount_pinned_pages(struct mmpin *mmp)
 {
 	if (mmp->user) {
 		atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
 		free_uid(mmp->user);
 	}
 }
+EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
 
 struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 {
-- 
cgit v1.2.3


From ea8994cb0118993da179af836cc72a583d75dc4b Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 15 Feb 2018 10:49:33 -0800
Subject: rds: hold a sock ref from rds_message to the rds_sock

The existing model holds a reference from the rds_sock to the
rds_message, but the rds_message does not itself hold a sock_put()
on the rds_sock. Instead the m_rs field in the rds_message is
assigned when the message is queued on the sock, and nulled when
the message is dequeued from the sock.

We want to be able to notify userspace when the rds_message
is actually freed (from rds_message_purge(), after the refcounts
to the rds_message go to 0). At the time that rds_message_purge()
is called, the message is no longer on the rds_sock retransmit
queue. Thus the explicit reference for the m_rs is needed to
send a notification that will signal to userspace that
it is now safe to free/reuse any pages that may have
been pinned down for zerocopy.

This patch manages the m_rs assignment in the rds_message with
the necessary refcount book-keeping.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/message.c | 8 +++++++-
 net/rds/send.c    | 7 +------
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index 4318cc9b78f7..ef3daafa3d79 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -58,7 +58,7 @@ EXPORT_SYMBOL_GPL(rds_message_addref);
  */
 static void rds_message_purge(struct rds_message *rm)
 {
-	unsigned long i;
+	unsigned long i, flags;
 
 	if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
 		return;
@@ -69,6 +69,12 @@ static void rds_message_purge(struct rds_message *rm)
 		__free_page(sg_page(&rm->data.op_sg[i]));
 	}
 	rm->data.op_nents = 0;
+	spin_lock_irqsave(&rm->m_rs_lock, flags);
+	if (rm->m_rs) {
+		sock_put(rds_rs_to_sk(rm->m_rs));
+		rm->m_rs = NULL;
+	}
+	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
 	if (rm->rdma.op_active)
 		rds_rdma_free_op(&rm->rdma);
diff --git a/net/rds/send.c b/net/rds/send.c
index b1b0022b8370..e8f3ff471b15 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -649,7 +649,6 @@ static void rds_send_remove_from_sock(struct list_head *messages, int status)
 				rm->rdma.op_notifier = NULL;
 			}
 			was_on_sock = 1;
-			rm->m_rs = NULL;
 		}
 		spin_unlock(&rs->rs_lock);
 
@@ -756,9 +755,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 		 */
 		if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
 			spin_unlock_irqrestore(&cp->cp_lock, flags);
-			spin_lock_irqsave(&rm->m_rs_lock, flags);
-			rm->m_rs = NULL;
-			spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 			continue;
 		}
 		list_del_init(&rm->m_conn_item);
@@ -774,7 +770,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
 		spin_unlock(&rs->rs_lock);
 
-		rm->m_rs = NULL;
 		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
 		rds_message_put(rm);
@@ -798,7 +793,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
 		__rds_send_complete(rs, rm, RDS_RDMA_CANCELED);
 		spin_unlock(&rs->rs_lock);
 
-		rm->m_rs = NULL;
 		spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
 		rds_message_put(rm);
@@ -849,6 +843,7 @@ static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
 		list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
 		set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
 		rds_message_addref(rm);
+		sock_hold(rds_rs_to_sk(rs));
 		rm->m_rs = rs;
 
 		/* The code ordering is a little weird, but we're
-- 
cgit v1.2.3


From 28190752c709272de3c2b6b092029da3f1614c5a Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 15 Feb 2018 10:49:34 -0800
Subject: sock: permit SO_ZEROCOPY on PF_RDS socket

allow the application to set SO_ZEROCOPY on the underlying sk
of a PF_RDS socket

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index e90d461748f0..a1fa4a548f1b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1049,18 +1049,21 @@ set_rcvbuf:
 		break;
 
 	case SO_ZEROCOPY:
-		if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
+		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
+			if (sk->sk_protocol != IPPROTO_TCP)
+				ret = -ENOTSUPP;
+			else if (sk->sk_state != TCP_CLOSE)
+				ret = -EBUSY;
+		} else if (sk->sk_family != PF_RDS) {
 			ret = -ENOTSUPP;
-		else if (sk->sk_protocol != IPPROTO_TCP)
-			ret = -ENOTSUPP;
-		else if (sk->sk_state != TCP_CLOSE)
-			ret = -EBUSY;
-		else if (val < 0 || val > 1)
-			ret = -EINVAL;
-		else
-			sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
-		break;
-
+		}
+		if (!ret) {
+			if (val < 0 || val > 1)
+				ret = -EINVAL;
+			else
+				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
+			break;
+		}
 	default:
 		ret = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3


From 01883eda72bd3f0a6c81447e4f223de14033fd9d Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 15 Feb 2018 10:49:35 -0800
Subject: rds: support for zcopy completion notification

RDS removes a datagram (rds_message) from the retransmit queue when
an ACK is received. The ACK indicates that the receiver has queued
the RDS datagram, so that the sender can safely forget the datagram.
When all references to the rds_message are quiesced, rds_message_purge
is called to release resources used by the rds_message

If the datagram to be removed had pinned pages set up, add
an entry to the rs->rs_znotify_queue so that the notifcation
will be sent up via rds_rm_zerocopy_callback() when the
rds_message is eventually freed by rds_message_purge.

rds_rm_zerocopy_callback() attempts to batch the number of cookies
sent with each notification  to a max of SO_EE_ORIGIN_MAX_ZCOOKIES.
This is achieved by checking the tail skb in the sk_error_queue:
if this has room for one more cookie, the cookie from the
current notification is added; else a new skb is added to the
sk_error_queue. Every invocation of rds_rm_zerocopy_callback() will
trigger a ->sk_error_report to notify the application.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/af_rds.c  |  2 ++
 net/rds/message.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 net/rds/rds.h     | 14 ++++++++++
 net/rds/recv.c    |  2 ++
 4 files changed, 94 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 0a8eefd256b3..a937f18896ae 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -182,6 +182,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
 		mask |= (EPOLLIN | EPOLLRDNORM);
 	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
 		mask |= (EPOLLOUT | EPOLLWRNORM);
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+		mask |= POLLERR;
 	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
 
 	/* clear state any time we wake a seen-congested socket */
diff --git a/net/rds/message.c b/net/rds/message.c
index ef3daafa3d79..bf1a656b198a 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -33,6 +33,9 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/errqueue.h>
 
 #include "rds.h"
 
@@ -53,29 +56,95 @@ void rds_message_addref(struct rds_message *rm)
 }
 EXPORT_SYMBOL_GPL(rds_message_addref);
 
+static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie)
+{
+	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+	int ncookies;
+	u32 *ptr;
+
+	if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE)
+		return false;
+	ncookies = serr->ee.ee_data;
+	if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES)
+		return false;
+	ptr = skb_put(skb, sizeof(u32));
+	*ptr = cookie;
+	serr->ee.ee_data = ++ncookies;
+	return true;
+}
+
+static void rds_rm_zerocopy_callback(struct rds_sock *rs,
+				     struct rds_znotifier *znotif)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+	struct sk_buff *skb, *tail;
+	struct sock_exterr_skb *serr;
+	unsigned long flags;
+	struct sk_buff_head *q;
+	u32 cookie = znotif->z_cookie;
+
+	q = &sk->sk_error_queue;
+	spin_lock_irqsave(&q->lock, flags);
+	tail = skb_peek_tail(q);
+
+	if (tail && skb_zcookie_add(tail, cookie)) {
+		spin_unlock_irqrestore(&q->lock, flags);
+		mm_unaccount_pinned_pages(&znotif->z_mmp);
+		consume_skb(rds_skb_from_znotifier(znotif));
+		sk->sk_error_report(sk);
+		return;
+	}
+
+	skb = rds_skb_from_znotifier(znotif);
+	serr = SKB_EXT_ERR(skb);
+	memset(&serr->ee, 0, sizeof(serr->ee));
+	serr->ee.ee_errno = 0;
+	serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE;
+	serr->ee.ee_info = 0;
+	WARN_ON(!skb_zcookie_add(skb, cookie));
+
+	__skb_queue_tail(q, skb);
+
+	spin_unlock_irqrestore(&q->lock, flags);
+	sk->sk_error_report(sk);
+
+	mm_unaccount_pinned_pages(&znotif->z_mmp);
+}
+
 /*
  * This relies on dma_map_sg() not touching sg[].page during merging.
  */
 static void rds_message_purge(struct rds_message *rm)
 {
 	unsigned long i, flags;
+	bool zcopy = false;
 
 	if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
 		return;
 
-	for (i = 0; i < rm->data.op_nents; i++) {
-		rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
-		/* XXX will have to put_page for page refs */
-		__free_page(sg_page(&rm->data.op_sg[i]));
-	}
-	rm->data.op_nents = 0;
 	spin_lock_irqsave(&rm->m_rs_lock, flags);
 	if (rm->m_rs) {
-		sock_put(rds_rs_to_sk(rm->m_rs));
+		struct rds_sock *rs = rm->m_rs;
+
+		if (rm->data.op_mmp_znotifier) {
+			zcopy = true;
+			rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier);
+			rm->data.op_mmp_znotifier = NULL;
+		}
+		sock_put(rds_rs_to_sk(rs));
 		rm->m_rs = NULL;
 	}
 	spin_unlock_irqrestore(&rm->m_rs_lock, flags);
 
+	for (i = 0; i < rm->data.op_nents; i++) {
+		/* XXX will have to put_page for page refs */
+		if (!zcopy)
+			__free_page(sg_page(&rm->data.op_sg[i]));
+		else
+			put_page(sg_page(&rm->data.op_sg[i]));
+	}
+	rm->data.op_nents = 0;
+
 	if (rm->rdma.op_active)
 		rds_rdma_free_op(&rm->rdma);
 	if (rm->rdma.op_rdma_mr)
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 7301b9b01890..24576bc4a5e9 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -356,6 +356,19 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
 #define RDS_MSG_PAGEVEC		7
 #define RDS_MSG_FLUSH		8
 
+struct rds_znotifier {
+	struct list_head	z_list;
+	struct mmpin		z_mmp;
+	u32			z_cookie;
+};
+
+#define	RDS_ZCOPY_SKB(__skb)	((struct rds_znotifier *)&((__skb)->cb[0]))
+
+static inline struct sk_buff *rds_skb_from_znotifier(struct rds_znotifier *z)
+{
+	return container_of((void *)z, struct sk_buff, cb);
+}
+
 struct rds_message {
 	refcount_t		m_refcount;
 	struct list_head	m_sock_item;
@@ -436,6 +449,7 @@ struct rds_message {
 			unsigned int		op_count;
 			unsigned int		op_dmasg;
 			unsigned int		op_dmaoff;
+			struct rds_znotifier	*op_mmp_znotifier;
 			struct scatterlist	*op_sg;
 		} data;
 	};
diff --git a/net/rds/recv.c b/net/rds/recv.c
index b25bcfe411ca..b080961464df 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -594,6 +594,8 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 
 	if (msg_flags & MSG_OOB)
 		goto out;
+	if (msg_flags & MSG_ERRQUEUE)
+		return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR);
 
 	while (1) {
 		/* If there are pending notifications, do those - and nothing else */
-- 
cgit v1.2.3


From 0cebaccef3acbdfbc2d85880a2efb765d2f4e2e3 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 15 Feb 2018 10:49:36 -0800
Subject: rds: zerocopy Tx support.

If the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and,
if the SO_ZEROCOPY socket option has been set on the PF_RDS socket,
application pages sent down with rds_sendmsg() are pinned.

The pinning uses the accounting infrastructure added by
Commit a91dbff551a6 ("sock: ulimit on MSG_ZEROCOPY pages")

The payload bytes in the message may not be modified for the
duration that the message has been pinned. A multi-threaded
application using this infrastructure may thus need to be notified
about send-completion so that it can free/reuse the buffers
passed to rds_sendmsg(). Notification of send-completion will
identify each message-buffer by a cookie that the application
must specify as ancillary data to rds_sendmsg().
The ancillary data in this case has cmsg_level == SOL_RDS
and cmsg_type == RDS_CMSG_ZCOPY_COOKIE.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/message.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/rds/rds.h     |  3 ++-
 net/rds/send.c    | 44 ++++++++++++++++++++++++++++++++++++++------
 3 files changed, 90 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index bf1a656b198a..651834513481 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -341,12 +341,14 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
 	return rm;
 }
 
-int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)
+int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
+			       bool zcopy)
 {
 	unsigned long to_copy, nbytes;
 	unsigned long sg_off;
 	struct scatterlist *sg;
 	int ret = 0;
+	int length = iov_iter_count(from);
 
 	rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
 
@@ -356,6 +358,53 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from)
 	sg = rm->data.op_sg;
 	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
 
+	if (zcopy) {
+		int total_copied = 0;
+		struct sk_buff *skb;
+
+		skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32),
+				GFP_KERNEL);
+		if (!skb)
+			return -ENOMEM;
+		rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb);
+		if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
+					    length)) {
+			ret = -ENOMEM;
+			goto err;
+		}
+		while (iov_iter_count(from)) {
+			struct page *pages;
+			size_t start;
+			ssize_t copied;
+
+			copied = iov_iter_get_pages(from, &pages, PAGE_SIZE,
+						    1, &start);
+			if (copied < 0) {
+				struct mmpin *mmp;
+				int i;
+
+				for (i = 0; i < rm->data.op_nents; i++)
+					put_page(sg_page(&rm->data.op_sg[i]));
+				mmp = &rm->data.op_mmp_znotifier->z_mmp;
+				mm_unaccount_pinned_pages(mmp);
+				ret = -EFAULT;
+				goto err;
+			}
+			total_copied += copied;
+			iov_iter_advance(from, copied);
+			length -= copied;
+			sg_set_page(sg, pages, copied, start);
+			rm->data.op_nents++;
+			sg++;
+		}
+		WARN_ON_ONCE(length != 0);
+		return ret;
+err:
+		consume_skb(skb);
+		rm->data.op_mmp_znotifier = NULL;
+		return ret;
+	} /* zcopy */
+
 	while (iov_iter_count(from)) {
 		if (!sg_page(sg)) {
 			ret = rds_page_remainder_alloc(sg, iov_iter_count(from),
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 24576bc4a5e9..31cd38852050 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -785,7 +785,8 @@ rds_conn_connecting(struct rds_connection *conn)
 /* message.c */
 struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
 struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
-int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from);
+int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
+			       bool zcopy);
 struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
 				 __be16 dport, u64 seq);
diff --git a/net/rds/send.c b/net/rds/send.c
index e8f3ff471b15..028ab598ac1b 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -875,12 +875,13 @@ out:
  * rds_message is getting to be quite complicated, and we'd like to allocate
  * it all in one go. This figures out how big it needs to be up front.
  */
-static int rds_rm_size(struct msghdr *msg, int data_len)
+static int rds_rm_size(struct msghdr *msg, int num_sgs)
 {
 	struct cmsghdr *cmsg;
 	int size = 0;
 	int cmsg_groups = 0;
 	int retval;
+	bool zcopy_cookie = false;
 
 	for_each_cmsghdr(cmsg, msg) {
 		if (!CMSG_OK(msg, cmsg))
@@ -899,6 +900,8 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
 
 			break;
 
+		case RDS_CMSG_ZCOPY_COOKIE:
+			zcopy_cookie = true;
 		case RDS_CMSG_RDMA_DEST:
 		case RDS_CMSG_RDMA_MAP:
 			cmsg_groups |= 2;
@@ -919,7 +922,10 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
 
 	}
 
-	size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist);
+	if ((msg->msg_flags & MSG_ZEROCOPY) && !zcopy_cookie)
+		return -EINVAL;
+
+	size += num_sgs * sizeof(struct scatterlist);
 
 	/* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */
 	if (cmsg_groups == 3)
@@ -928,6 +934,18 @@ static int rds_rm_size(struct msghdr *msg, int data_len)
 	return size;
 }
 
+static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm,
+			  struct cmsghdr *cmsg)
+{
+	u32 *cookie;
+
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)))
+		return -EINVAL;
+	cookie = CMSG_DATA(cmsg);
+	rm->data.op_mmp_znotifier->z_cookie = *cookie;
+	return 0;
+}
+
 static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			 struct msghdr *msg, int *allocated_mr)
 {
@@ -970,6 +988,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
 			ret = rds_cmsg_atomic(rs, rm, cmsg);
 			break;
 
+		case RDS_CMSG_ZCOPY_COOKIE:
+			ret = rds_cmsg_zcopy(rs, rm, cmsg);
+			break;
+
 		default:
 			return -EINVAL;
 		}
@@ -1040,10 +1062,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 	long timeo = sock_sndtimeo(sk, nonblock);
 	struct rds_conn_path *cpath;
 	size_t total_payload_len = payload_len, rdma_payload_len = 0;
+	bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
+		      sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
+	int num_sgs = ceil(payload_len, PAGE_SIZE);
 
 	/* Mirror Linux UDP mirror of BSD error message compatibility */
 	/* XXX: Perhaps MSG_MORE someday */
-	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
+	if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT | MSG_ZEROCOPY)) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
@@ -1087,8 +1112,15 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 		goto out;
 	}
 
+	if (zcopy) {
+		if (rs->rs_transport->t_type != RDS_TRANS_TCP) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+		num_sgs = iov_iter_npages(&msg->msg_iter, INT_MAX);
+	}
 	/* size of rm including all sgs */
-	ret = rds_rm_size(msg, payload_len);
+	ret = rds_rm_size(msg, num_sgs);
 	if (ret < 0)
 		goto out;
 
@@ -1100,12 +1132,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 
 	/* Attach data to the rm */
 	if (payload_len) {
-		rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE));
+		rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs);
 		if (!rm->data.op_sg) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		ret = rds_message_copy_from_user(rm, &msg->msg_iter);
+		ret = rds_message_copy_from_user(rm, &msg->msg_iter, zcopy);
 		if (ret)
 			goto out;
 	}
-- 
cgit v1.2.3


From 1af85155813622767d223af6d4dff283ebeea7a7 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 15 Feb 2018 10:54:53 -0500
Subject: net: sched: act: fix code style

This patch is used by subsequent patches. It fixes code style issues
caught by checkpatch.

Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c    | 12 ++++++------
 net/sched/act_mirred.c |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 624995564e5a..a32e6c2edbf6 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -621,7 +621,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 			goto err_out;
 		err = -EINVAL;
 		kind = tb[TCA_ACT_KIND];
-		if (kind == NULL)
+		if (!kind)
 			goto err_out;
 		if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ)
 			goto err_out;
@@ -822,7 +822,7 @@ static int tca_get_fill(struct sk_buff *skb, struct list_head *actions,
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
-	if (nest == NULL)
+	if (!nest)
 		goto out_nlmsg_trim;
 
 	if (tcf_action_dump(skb, actions, bind, ref) < 0)
@@ -934,7 +934,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
-	if (nest == NULL)
+	if (!nest)
 		goto out_module_put;
 
 	err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
@@ -1007,10 +1007,10 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 		return ret;
 
 	if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) {
-		if (tb[1] != NULL)
+		if (tb[1])
 			return tca_action_flush(net, tb[1], n, portid);
-		else
-			return -EINVAL;
+
+		return -EINVAL;
 	}
 
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index e6ff88f72900..abcd5f12b913 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -80,12 +80,12 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	bool exists = false;
 	int ret;
 
-	if (nla == NULL)
+	if (!nla)
 		return -EINVAL;
 	ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, NULL);
 	if (ret < 0)
 		return ret;
-	if (tb[TCA_MIRRED_PARMS] == NULL)
+	if (!tb[TCA_MIRRED_PARMS])
 		return -EINVAL;
 	parm = nla_data(tb[TCA_MIRRED_PARMS]);
 
@@ -117,7 +117,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (!exists) {
-		if (dev == NULL)
+		if (!dev)
 			return -EINVAL;
 		ret = tcf_idr_create(tn, parm->index, est, a,
 				     &act_mirred_ops, bind, true);
-- 
cgit v1.2.3


From aea0d727899140820a631bac78f36e9d9ef15ef6 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 15 Feb 2018 10:54:54 -0500
Subject: net: sched: act: add extack to init

This patch adds extack to tcf_action_init and tcf_action_init_1
functions. These are necessary to make individual extack handling in
each act implementation.

Based on work by David Ahern <dsahern@gmail.com>

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 17 +++++++++++------
 net/sched/cls_api.c |  4 ++--
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index a32e6c2edbf6..662574646256 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -605,7 +605,8 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
 
 struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 				    struct nlattr *nla, struct nlattr *est,
-				    char *name, int ovr, int bind)
+				    char *name, int ovr, int bind,
+				    struct netlink_ext_ack *extack)
 {
 	struct tc_action *a;
 	struct tc_action_ops *a_o;
@@ -726,7 +727,7 @@ static void cleanup_a(struct list_head *actions, int ovr)
 
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 		    struct nlattr *est, char *name, int ovr, int bind,
-		    struct list_head *actions)
+		    struct list_head *actions, struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
 	struct tc_action *act;
@@ -738,7 +739,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 		return err;
 
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
-		act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind);
+		act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind,
+					extack);
 		if (IS_ERR(act)) {
 			err = PTR_ERR(act);
 			goto err;
@@ -1062,12 +1064,14 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
 }
 
 static int tcf_action_add(struct net *net, struct nlattr *nla,
-			  struct nlmsghdr *n, u32 portid, int ovr)
+			  struct nlmsghdr *n, u32 portid, int ovr,
+			  struct netlink_ext_ack *extack)
 {
 	int ret = 0;
 	LIST_HEAD(actions);
 
-	ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions);
+	ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions,
+			      extack);
 	if (ret)
 		return ret;
 
@@ -1115,7 +1119,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 		if (n->nlmsg_flags & NLM_F_REPLACE)
 			ovr = 1;
 replay:
-		ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr);
+		ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr,
+				     extack);
 		if (ret == -EAGAIN)
 			goto replay;
 		break;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 2bc1bc23d42e..f21610c5da1a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1434,7 +1434,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 		if (exts->police && tb[exts->police]) {
 			act = tcf_action_init_1(net, tp, tb[exts->police],
 						rate_tlv, "police", ovr,
-						TCA_ACT_BIND);
+						TCA_ACT_BIND, extack);
 			if (IS_ERR(act))
 				return PTR_ERR(act);
 
@@ -1447,7 +1447,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 
 			err = tcf_action_init(net, tp, tb[exts->action],
 					      rate_tlv, NULL, ovr, TCA_ACT_BIND,
-					      &actions);
+					      &actions, extack);
 			if (err)
 				return err;
 			list_for_each_entry(act, &actions, list)
-- 
cgit v1.2.3


From 84ae017a00776486390e2c0cceb4f717c3f809c1 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 15 Feb 2018 10:54:55 -0500
Subject: net: sched: act: handle generic action errors

This patch adds extack support for generic act handling. The extack
will be set deeper to each called function which is not part of netdev
core api.

Based on work by David Ahern <dsahern@gmail.com>

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 93 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 61 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 662574646256..8e77ddd9f0ad 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -617,31 +617,40 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 	int err;
 
 	if (name == NULL) {
-		err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, NULL);
+		err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack);
 		if (err < 0)
 			goto err_out;
 		err = -EINVAL;
 		kind = tb[TCA_ACT_KIND];
-		if (!kind)
+		if (!kind) {
+			NL_SET_ERR_MSG(extack, "TC action kind must be specified");
 			goto err_out;
-		if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ)
+		}
+		if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) {
+			NL_SET_ERR_MSG(extack, "TC action name too long");
 			goto err_out;
+		}
 		if (tb[TCA_ACT_COOKIE]) {
 			int cklen = nla_len(tb[TCA_ACT_COOKIE]);
 
-			if (cklen > TC_COOKIE_MAX_SIZE)
+			if (cklen > TC_COOKIE_MAX_SIZE) {
+				NL_SET_ERR_MSG(extack, "TC cookie size above the maximum");
 				goto err_out;
+			}
 
 			cookie = nla_memdup_cookie(tb);
 			if (!cookie) {
+				NL_SET_ERR_MSG(extack, "No memory to generate TC cookie");
 				err = -ENOMEM;
 				goto err_out;
 			}
 		}
 	} else {
-		err = -EINVAL;
-		if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ)
+		if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) {
+			NL_SET_ERR_MSG(extack, "TC action name too long");
+			err = -EINVAL;
 			goto err_out;
+		}
 	}
 
 	a_o = tc_lookup_action_n(act_name);
@@ -664,6 +673,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 			goto err_mod;
 		}
 #endif
+		NL_SET_ERR_MSG(extack, "Failed to load TC action module");
 		err = -ENOENT;
 		goto err_out;
 	}
@@ -698,6 +708,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 
 			list_add_tail(&a->list, &actions);
 			tcf_action_destroy(&actions, bind);
+			NL_SET_ERR_MSG(extack, "Failed to init TC action chain");
 			return ERR_PTR(err);
 		}
 	}
@@ -734,7 +745,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 	int err;
 	int i;
 
-	err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, NULL);
+	err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack);
 	if (err < 0)
 		return err;
 
@@ -842,7 +853,8 @@ out_nlmsg_trim:
 
 static int
 tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
-	       struct list_head *actions, int event)
+	       struct list_head *actions, int event,
+	       struct netlink_ext_ack *extack)
 {
 	struct sk_buff *skb;
 
@@ -851,6 +863,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
 		return -ENOBUFS;
 	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
 			 0, 0) <= 0) {
+		NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -859,7 +872,8 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
 }
 
 static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
-					  struct nlmsghdr *n, u32 portid)
+					  struct nlmsghdr *n, u32 portid,
+					  struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[TCA_ACT_MAX + 1];
 	const struct tc_action_ops *ops;
@@ -867,20 +881,24 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
 	int index;
 	int err;
 
-	err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, NULL);
+	err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack);
 	if (err < 0)
 		goto err_out;
 
 	err = -EINVAL;
 	if (tb[TCA_ACT_INDEX] == NULL ||
-	    nla_len(tb[TCA_ACT_INDEX]) < sizeof(index))
+	    nla_len(tb[TCA_ACT_INDEX]) < sizeof(index)) {
+		NL_SET_ERR_MSG(extack, "Invalid TC action index value");
 		goto err_out;
+	}
 	index = nla_get_u32(tb[TCA_ACT_INDEX]);
 
 	err = -EINVAL;
 	ops = tc_lookup_action(tb[TCA_ACT_KIND]);
-	if (!ops) /* could happen in batch of actions */
+	if (!ops) { /* could happen in batch of actions */
+		NL_SET_ERR_MSG(extack, "Specified TC action not found");
 		goto err_out;
+	}
 	err = -ENOENT;
 	if (ops->lookup(net, &a, index) == 0)
 		goto err_mod;
@@ -895,7 +913,8 @@ err_out:
 }
 
 static int tca_action_flush(struct net *net, struct nlattr *nla,
-			    struct nlmsghdr *n, u32 portid)
+			    struct nlmsghdr *n, u32 portid,
+			    struct netlink_ext_ack *extack)
 {
 	struct sk_buff *skb;
 	unsigned char *b;
@@ -909,35 +928,39 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 	int err = -ENOMEM;
 
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
-	if (!skb) {
-		pr_debug("tca_action_flush: failed skb alloc\n");
+	if (!skb)
 		return err;
-	}
 
 	b = skb_tail_pointer(skb);
 
-	err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, NULL);
+	err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, extack);
 	if (err < 0)
 		goto err_out;
 
 	err = -EINVAL;
 	kind = tb[TCA_ACT_KIND];
 	ops = tc_lookup_action(kind);
-	if (!ops) /*some idjot trying to flush unknown action */
+	if (!ops) { /*some idjot trying to flush unknown action */
+		NL_SET_ERR_MSG(extack, "Cannot flush unknown TC action");
 		goto err_out;
+	}
 
 	nlh = nlmsg_put(skb, portid, n->nlmsg_seq, RTM_DELACTION,
 			sizeof(*t), 0);
-	if (!nlh)
+	if (!nlh) {
+		NL_SET_ERR_MSG(extack, "Failed to create TC action flush notification");
 		goto out_module_put;
+	}
 	t = nlmsg_data(nlh);
 	t->tca_family = AF_UNSPEC;
 	t->tca__pad1 = 0;
 	t->tca__pad2 = 0;
 
 	nest = nla_nest_start(skb, TCA_ACT_TAB);
-	if (!nest)
+	if (!nest) {
+		NL_SET_ERR_MSG(extack, "Failed to add new netlink message");
 		goto out_module_put;
+	}
 
 	err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
 	if (err <= 0) {
@@ -954,6 +977,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 			     n->nlmsg_flags & NLM_F_ECHO);
 	if (err > 0)
 		return 0;
+	if (err < 0)
+		NL_SET_ERR_MSG(extack, "Failed to send TC action flush notification");
 
 	return err;
 
@@ -966,7 +991,7 @@ err_out:
 
 static int
 tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
-	       u32 portid)
+	       u32 portid, struct netlink_ext_ack *extack)
 {
 	int ret;
 	struct sk_buff *skb;
@@ -977,6 +1002,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
 
 	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
 			 0, 1) <= 0) {
+		NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes");
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -984,6 +1010,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
 	/* now do the delete */
 	ret = tcf_action_destroy(actions, 0);
 	if (ret < 0) {
+		NL_SET_ERR_MSG(extack, "Failed to delete TC action");
 		kfree_skb(skb);
 		return ret;
 	}
@@ -997,26 +1024,27 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
 
 static int
 tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
-	      u32 portid, int event)
+	      u32 portid, int event, struct netlink_ext_ack *extack)
 {
 	int i, ret;
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
 	struct tc_action *act;
 	LIST_HEAD(actions);
 
-	ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, NULL);
+	ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack);
 	if (ret < 0)
 		return ret;
 
 	if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) {
 		if (tb[1])
-			return tca_action_flush(net, tb[1], n, portid);
+			return tca_action_flush(net, tb[1], n, portid, extack);
 
+		NL_SET_ERR_MSG(extack, "Invalid netlink attributes while flushing TC action");
 		return -EINVAL;
 	}
 
 	for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
-		act = tcf_action_get_1(net, tb[i], n, portid);
+		act = tcf_action_get_1(net, tb[i], n, portid, extack);
 		if (IS_ERR(act)) {
 			ret = PTR_ERR(act);
 			goto err;
@@ -1026,9 +1054,9 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 	}
 
 	if (event == RTM_GETACTION)
-		ret = tcf_get_notify(net, portid, n, &actions, event);
+		ret = tcf_get_notify(net, portid, n, &actions, event, extack);
 	else { /* delete */
-		ret = tcf_del_notify(net, n, &actions, portid);
+		ret = tcf_del_notify(net, n, &actions, portid, extack);
 		if (ret)
 			goto err;
 		return ret;
@@ -1041,7 +1069,7 @@ err:
 
 static int
 tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
-	       u32 portid)
+	       u32 portid, struct netlink_ext_ack *extack)
 {
 	struct sk_buff *skb;
 	int err = 0;
@@ -1052,6 +1080,7 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
 
 	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags,
 			 RTM_NEWACTION, 0, 0) <= 0) {
+		NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while deleting TC action");
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -1075,7 +1104,7 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
 	if (ret)
 		return ret;
 
-	return tcf_add_notify(net, n, &actions, portid);
+	return tcf_add_notify(net, n, &actions, portid, extack);
 }
 
 static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
@@ -1103,7 +1132,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
 		return ret;
 
 	if (tca[TCA_ACT_TAB] == NULL) {
-		pr_notice("tc_ctl_action: received NO action attribs\n");
+		NL_SET_ERR_MSG(extack, "Netlink action attributes missing");
 		return -EINVAL;
 	}
 
@@ -1126,11 +1155,11 @@ replay:
 		break;
 	case RTM_DELACTION:
 		ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
-				    portid, RTM_DELACTION);
+				    portid, RTM_DELACTION, extack);
 		break;
 	case RTM_GETACTION:
 		ret = tca_action_gd(net, tca[TCA_ACT_TAB], n,
-				    portid, RTM_GETACTION);
+				    portid, RTM_GETACTION, extack);
 		break;
 	default:
 		BUG();
-- 
cgit v1.2.3


From 589dad6d71a72dd7912e5070c63f6bf1f561b5cf Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 15 Feb 2018 10:54:56 -0500
Subject: net: sched: act: add extack to init callback

This patch adds extack support for act init callback api. This
prepares to handle extack support inside each specific act
implementation.

Based on work by David Ahern <dsahern@gmail.com>

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c        | 5 +++--
 net/sched/act_bpf.c        | 2 +-
 net/sched/act_connmark.c   | 3 ++-
 net/sched/act_csum.c       | 2 +-
 net/sched/act_gact.c       | 2 +-
 net/sched/act_ife.c        | 2 +-
 net/sched/act_ipt.c        | 4 ++--
 net/sched/act_mirred.c     | 2 +-
 net/sched/act_nat.c        | 3 ++-
 net/sched/act_pedit.c      | 2 +-
 net/sched/act_police.c     | 3 ++-
 net/sched/act_sample.c     | 2 +-
 net/sched/act_simple.c     | 2 +-
 net/sched/act_skbedit.c    | 2 +-
 net/sched/act_skbmod.c     | 2 +-
 net/sched/act_tunnel_key.c | 2 +-
 net/sched/act_vlan.c       | 2 +-
 17 files changed, 23 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 8e77ddd9f0ad..576a0c311e5e 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -680,9 +680,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
 
 	/* backward compatibility for policer */
 	if (name == NULL)
-		err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind);
+		err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind,
+				extack);
 	else
-		err = a_o->init(net, nla, est, &a, ovr, bind);
+		err = a_o->init(net, nla, est, &a, ovr, bind, extack);
 	if (err < 0)
 		goto err_mod;
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index b3f2c15affa7..b3ebfa9598e2 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -272,7 +272,7 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog,
 
 static int tcf_bpf_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **act,
-			int replace, int bind)
+			int replace, int bind, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, bpf_net_id);
 	struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 2b15ba84e0c8..20e0215360b5 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -96,7 +96,8 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = {
 
 static int tcf_connmark_init(struct net *net, struct nlattr *nla,
 			     struct nlattr *est, struct tc_action **a,
-			     int ovr, int bind)
+			     int ovr, int bind,
+			     struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, connmark_net_id);
 	struct nlattr *tb[TCA_CONNMARK_MAX + 1];
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index b7ba9b06b147..3b8c48bb2683 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -46,7 +46,7 @@ static struct tc_action_ops act_csum_ops;
 
 static int tcf_csum_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a, int ovr,
-			 int bind)
+			 int bind, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, csum_net_id);
 	struct tcf_csum_params *params_old, *params_new;
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index b56986d41c87..912f3398f1c1 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -56,7 +56,7 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
 
 static int tcf_gact_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
-			 int ovr, int bind)
+			 int ovr, int bind, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, gact_net_id);
 	struct nlattr *tb[TCA_GACT_MAX + 1];
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 5954e992685a..e5127d400737 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -447,7 +447,7 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb,
 
 static int tcf_ife_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **a,
-			int ovr, int bind)
+			int ovr, int bind, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, ife_net_id);
 	struct nlattr *tb[TCA_IFE_MAX + 1];
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 06e380ae0928..6894cfa83863 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -193,7 +193,7 @@ err1:
 
 static int tcf_ipt_init(struct net *net, struct nlattr *nla,
 			struct nlattr *est, struct tc_action **a, int ovr,
-			int bind)
+			int bind, struct netlink_ext_ack *extack)
 {
 	return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr,
 			      bind);
@@ -201,7 +201,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla,
 
 static int tcf_xt_init(struct net *net, struct nlattr *nla,
 		       struct nlattr *est, struct tc_action **a, int ovr,
-		       int bind)
+		       int bind, struct netlink_ext_ack *extack)
 {
 	return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr,
 			      bind);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index abcd5f12b913..7ccd4c71179f 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -69,7 +69,7 @@ static struct tc_action_ops act_mirred_ops;
 
 static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a, int ovr,
-			   int bind)
+			   int bind, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, mirred_net_id);
 	struct nlattr *tb[TCA_MIRRED_MAX + 1];
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 98c6a4b2f523..7e5ebd7f52a6 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -37,7 +37,8 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
 };
 
 static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
-			struct tc_action **a, int ovr, int bind)
+			struct tc_action **a, int ovr, int bind,
+			struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, nat_net_id);
 	struct nlattr *tb[TCA_NAT_MAX + 1];
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 349beaffb29e..bb2c35ed6f10 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -132,7 +132,7 @@ static int tcf_pedit_key_ex_dump(struct sk_buff *skb,
 
 static int tcf_pedit_init(struct net *net, struct nlattr *nla,
 			  struct nlattr *est, struct tc_action **a,
-			  int ovr, int bind)
+			  int ovr, int bind, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 	struct nlattr *tb[TCA_PEDIT_MAX + 1];
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 95d3c9097b25..6b6facbe251b 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -74,7 +74,8 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
 
 static int tcf_act_police_init(struct net *net, struct nlattr *nla,
 			       struct nlattr *est, struct tc_action **a,
-			       int ovr, int bind)
+			       int ovr, int bind,
+			       struct netlink_ext_ack *extack)
 {
 	int ret = 0, err;
 	struct nlattr *tb[TCA_POLICE_MAX + 1];
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 1ba0df238756..f4579ceba1f6 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -37,7 +37,7 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = {
 
 static int tcf_sample_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a, int ovr,
-			   int bind)
+			   int bind, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, sample_net_id);
 	struct nlattr *tb[TCA_SAMPLE_MAX + 1];
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 425eac11f6da..b0346347c5f0 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -79,7 +79,7 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = {
 
 static int tcf_simp_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
-			 int ovr, int bind)
+			 int ovr, int bind, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, simp_net_id);
 	struct nlattr *tb[TCA_DEF_MAX + 1];
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 5a3f691bb545..7651c9d2182d 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -66,7 +66,7 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
 
 static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
 			    struct nlattr *est, struct tc_action **a,
-			    int ovr, int bind)
+			    int ovr, int bind, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
 	struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index fa975262dbac..1266449aa8ea 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -84,7 +84,7 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
 
 static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a,
-			   int ovr, int bind)
+			   int ovr, int bind, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
 	struct nlattr *tb[TCA_SKBMOD_MAX + 1];
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 0e23aac09ad6..dde01eca7ed0 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -70,7 +70,7 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
 
 static int tunnel_key_init(struct net *net, struct nlattr *nla,
 			   struct nlattr *est, struct tc_action **a,
-			   int ovr, int bind)
+			   int ovr, int bind, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
 	struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index e1a1b3f3983a..6c387310b1b6 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -109,7 +109,7 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
 
 static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 			 struct nlattr *est, struct tc_action **a,
-			 int ovr, int bind)
+			 int ovr, int bind, struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, vlan_net_id);
 	struct nlattr *tb[TCA_VLAN_MAX + 1];
-- 
cgit v1.2.3


From 331a9295de23a9428adb7f593d0701d393a2079e Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 15 Feb 2018 10:54:57 -0500
Subject: net: sched: act: add extack for lookup callback

This patch adds extack support for act lookup callback api. This
prepares to handle extack support inside each specific act
implementation.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c        | 2 +-
 net/sched/act_bpf.c        | 3 ++-
 net/sched/act_connmark.c   | 3 ++-
 net/sched/act_csum.c       | 3 ++-
 net/sched/act_gact.c       | 3 ++-
 net/sched/act_ife.c        | 3 ++-
 net/sched/act_ipt.c        | 6 ++++--
 net/sched/act_mirred.c     | 3 ++-
 net/sched/act_nat.c        | 3 ++-
 net/sched/act_pedit.c      | 3 ++-
 net/sched/act_police.c     | 3 ++-
 net/sched/act_sample.c     | 3 ++-
 net/sched/act_simple.c     | 3 ++-
 net/sched/act_skbedit.c    | 3 ++-
 net/sched/act_skbmod.c     | 3 ++-
 net/sched/act_tunnel_key.c | 3 ++-
 net/sched/act_vlan.c       | 3 ++-
 17 files changed, 35 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 576a0c311e5e..74ed1e288e57 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -901,7 +901,7 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
 		goto err_out;
 	}
 	err = -ENOENT;
-	if (ops->lookup(net, &a, index) == 0)
+	if (ops->lookup(net, &a, index, extack) == 0)
 		goto err_mod;
 
 	module_put(ops->owner);
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index b3ebfa9598e2..d9654b863347 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -374,7 +374,8 @@ static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index,
+			  struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, bpf_net_id);
 
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 20e0215360b5..0504b7600fb6 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -184,7 +184,8 @@ static int tcf_connmark_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index,
+			       struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, connmark_net_id);
 
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 3b8c48bb2683..bdd17b9ef034 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -638,7 +638,8 @@ static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index,
+			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, csum_net_id);
 
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 912f3398f1c1..e1e69e38f4b0 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -208,7 +208,8 @@ static int tcf_gact_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index,
+			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, gact_net_id);
 
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index e5127d400737..0b70fb0cc609 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -831,7 +831,8 @@ static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index,
+			  struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, ife_net_id);
 
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 6894cfa83863..f29af79a2d1f 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -310,7 +310,8 @@ static int tcf_ipt_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index,
+			  struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, ipt_net_id);
 
@@ -358,7 +359,8 @@ static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index,
+			 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, xt_net_id);
 
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 7ccd4c71179f..9980c6affb5e 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -272,7 +272,8 @@ static int tcf_mirred_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index,
+			     struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, mirred_net_id);
 
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 7e5ebd7f52a6..6f6d7667ef9a 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -285,7 +285,8 @@ static int tcf_nat_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index,
+			  struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, nat_net_id);
 
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index bb2c35ed6f10..308b2680a6d9 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -426,7 +426,8 @@ static int tcf_pedit_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index,
+			    struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 6b6facbe251b..1292f880eab0 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -305,7 +305,8 @@ nla_put_failure:
 	return -1;
 }
 
-static int tcf_police_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_police_search(struct net *net, struct tc_action **a, u32 index,
+			     struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, police_net_id);
 
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index f4579ceba1f6..22379b2cfd1a 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -209,7 +209,8 @@ static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index,
+			     struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, sample_net_id);
 
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index b0346347c5f0..3ebf71470977 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -177,7 +177,8 @@ static int tcf_simp_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index,
+			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, simp_net_id);
 
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 7651c9d2182d..ff1970e14016 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -215,7 +215,8 @@ static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index,
+			      struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
 
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 1266449aa8ea..110d7c1f823d 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -239,7 +239,8 @@ static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index,
+			     struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
 
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index dde01eca7ed0..65a19928f1a9 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -298,7 +298,8 @@ static int tunnel_key_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index)
+static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index,
+			     struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
 
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 6c387310b1b6..03dcbbc5ffd2 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -274,7 +274,8 @@ static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
 	return tcf_generic_walker(tn, skb, cb, type, ops);
 }
 
-static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index)
+static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index,
+			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, vlan_net_id);
 
-- 
cgit v1.2.3


From 417801055b8cb4c052e989289ccf24a673178bbc Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 15 Feb 2018 10:54:58 -0500
Subject: net: sched: act: add extack for walk callback

This patch adds extack support for act walker callback api. This
prepares to handle extack support inside each specific act
implementation.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c        | 4 ++--
 net/sched/act_bpf.c        | 3 ++-
 net/sched/act_connmark.c   | 3 ++-
 net/sched/act_csum.c       | 3 ++-
 net/sched/act_gact.c       | 3 ++-
 net/sched/act_ife.c        | 3 ++-
 net/sched/act_ipt.c        | 6 ++++--
 net/sched/act_mirred.c     | 3 ++-
 net/sched/act_nat.c        | 3 ++-
 net/sched/act_pedit.c      | 3 ++-
 net/sched/act_police.c     | 3 ++-
 net/sched/act_sample.c     | 3 ++-
 net/sched/act_simple.c     | 3 ++-
 net/sched/act_skbedit.c    | 3 ++-
 net/sched/act_skbmod.c     | 3 ++-
 net/sched/act_tunnel_key.c | 3 ++-
 net/sched/act_vlan.c       | 3 ++-
 17 files changed, 36 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 74ed1e288e57..ab107997b259 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -963,7 +963,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
 		goto out_module_put;
 	}
 
-	err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
+	err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops, extack);
 	if (err <= 0) {
 		nla_nest_cancel(skb, nest);
 		goto out_module_put;
@@ -1255,7 +1255,7 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
 	if (nest == NULL)
 		goto out_module_put;
 
-	ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o);
+	ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o, NULL);
 	if (ret < 0)
 		goto out_module_put;
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index d9654b863347..7e01e2c710c4 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -367,7 +367,8 @@ static void tcf_bpf_cleanup(struct tc_action *act)
 
 static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
 			  struct netlink_callback *cb, int type,
-			  const struct tc_action_ops *ops)
+			  const struct tc_action_ops *ops,
+			  struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, bpf_net_id);
 
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 0504b7600fb6..cb722da0bb15 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -177,7 +177,8 @@ nla_put_failure:
 
 static int tcf_connmark_walker(struct net *net, struct sk_buff *skb,
 			       struct netlink_callback *cb, int type,
-			       const struct tc_action_ops *ops)
+			       const struct tc_action_ops *ops,
+			       struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, connmark_net_id);
 
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index bdd17b9ef034..3e8efadb750f 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -631,7 +631,8 @@ static void tcf_csum_cleanup(struct tc_action *a)
 
 static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
 			   struct netlink_callback *cb, int type,
-			   const struct tc_action_ops *ops)
+			   const struct tc_action_ops *ops,
+			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, csum_net_id);
 
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index e1e69e38f4b0..d96ebe4bb65a 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -201,7 +201,8 @@ nla_put_failure:
 
 static int tcf_gact_walker(struct net *net, struct sk_buff *skb,
 			   struct netlink_callback *cb, int type,
-			   const struct tc_action_ops *ops)
+			   const struct tc_action_ops *ops,
+			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, gact_net_id);
 
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 0b70fb0cc609..b777e381e0dd 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -824,7 +824,8 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
 
 static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
 			  struct netlink_callback *cb, int type,
-			  const struct tc_action_ops *ops)
+			  const struct tc_action_ops *ops,
+			  struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, ife_net_id);
 
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index f29af79a2d1f..f33a8cc5dee6 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -303,7 +303,8 @@ nla_put_failure:
 
 static int tcf_ipt_walker(struct net *net, struct sk_buff *skb,
 			  struct netlink_callback *cb, int type,
-			  const struct tc_action_ops *ops)
+			  const struct tc_action_ops *ops,
+			  struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, ipt_net_id);
 
@@ -352,7 +353,8 @@ static struct pernet_operations ipt_net_ops = {
 
 static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
 			 struct netlink_callback *cb, int type,
-			 const struct tc_action_ops *ops)
+			 const struct tc_action_ops *ops,
+			 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, xt_net_id);
 
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 9980c6affb5e..3dcd295ea6a7 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -265,7 +265,8 @@ nla_put_failure:
 
 static int tcf_mirred_walker(struct net *net, struct sk_buff *skb,
 			     struct netlink_callback *cb, int type,
-			     const struct tc_action_ops *ops)
+			     const struct tc_action_ops *ops,
+			     struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, mirred_net_id);
 
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 6f6d7667ef9a..67243cdc0588 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -278,7 +278,8 @@ nla_put_failure:
 
 static int tcf_nat_walker(struct net *net, struct sk_buff *skb,
 			  struct netlink_callback *cb, int type,
-			  const struct tc_action_ops *ops)
+			  const struct tc_action_ops *ops,
+			  struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, nat_net_id);
 
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 308b2680a6d9..6d6481f6bffa 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -419,7 +419,8 @@ nla_put_failure:
 
 static int tcf_pedit_walker(struct net *net, struct sk_buff *skb,
 			    struct netlink_callback *cb, int type,
-			    const struct tc_action_ops *ops)
+			    const struct tc_action_ops *ops,
+			    struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 1292f880eab0..ff803414a736 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -58,7 +58,8 @@ static struct tc_action_ops act_police_ops;
 
 static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
 				 struct netlink_callback *cb, int type,
-				 const struct tc_action_ops *ops)
+				 const struct tc_action_ops *ops,
+				 struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, police_net_id);
 
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 22379b2cfd1a..7a2b6a33f239 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -202,7 +202,8 @@ nla_put_failure:
 
 static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
 			     struct netlink_callback *cb, int type,
-			     const struct tc_action_ops *ops)
+			     const struct tc_action_ops *ops,
+			     struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, sample_net_id);
 
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 3ebf71470977..3f5474d20702 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -170,7 +170,8 @@ nla_put_failure:
 
 static int tcf_simp_walker(struct net *net, struct sk_buff *skb,
 			   struct netlink_callback *cb, int type,
-			   const struct tc_action_ops *ops)
+			   const struct tc_action_ops *ops,
+			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, simp_net_id);
 
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index ff1970e14016..d99b6f1f5181 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -208,7 +208,8 @@ nla_put_failure:
 
 static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
 			      struct netlink_callback *cb, int type,
-			      const struct tc_action_ops *ops)
+			      const struct tc_action_ops *ops,
+			      struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
 
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 110d7c1f823d..369ea85d0f02 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -232,7 +232,8 @@ nla_put_failure:
 
 static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb,
 			     struct netlink_callback *cb, int type,
-			     const struct tc_action_ops *ops)
+			     const struct tc_action_ops *ops,
+			     struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
 
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 65a19928f1a9..bced6fd00d43 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -291,7 +291,8 @@ nla_put_failure:
 
 static int tunnel_key_walker(struct net *net, struct sk_buff *skb,
 			     struct netlink_callback *cb, int type,
-			     const struct tc_action_ops *ops)
+			     const struct tc_action_ops *ops,
+			     struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
 
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 03dcbbc5ffd2..7cf409443d02 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -267,7 +267,8 @@ nla_put_failure:
 
 static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
 			   struct netlink_callback *cb, int type,
-			   const struct tc_action_ops *ops)
+			   const struct tc_action_ops *ops,
+			   struct netlink_ext_ack *extack)
 {
 	struct tc_action_net *tn = net_generic(net, vlan_net_id);
 
-- 
cgit v1.2.3


From b36201455aa0749e8708ef97ed9c1c9ece29a113 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 15 Feb 2018 10:54:59 -0500
Subject: net: sched: act: handle extack in tcf_generic_walker

This patch adds extack handling for a common used TC act function
"tcf_generic_walker()" to add an extack message on failures.
The tcf_generic_walker() function can fail if get a invalid command
different than DEL and GET. The naming "action" here is wrong, the
correct naming would be command.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c        | 6 ++++--
 net/sched/act_bpf.c        | 2 +-
 net/sched/act_connmark.c   | 2 +-
 net/sched/act_csum.c       | 2 +-
 net/sched/act_gact.c       | 2 +-
 net/sched/act_ife.c        | 2 +-
 net/sched/act_ipt.c        | 4 ++--
 net/sched/act_mirred.c     | 2 +-
 net/sched/act_nat.c        | 2 +-
 net/sched/act_pedit.c      | 2 +-
 net/sched/act_police.c     | 2 +-
 net/sched/act_sample.c     | 2 +-
 net/sched/act_simple.c     | 2 +-
 net/sched/act_skbedit.c    | 2 +-
 net/sched/act_skbmod.c     | 2 +-
 net/sched/act_tunnel_key.c | 2 +-
 net/sched/act_vlan.c       | 2 +-
 17 files changed, 21 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index ab107997b259..1f65d6ada9ff 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -202,7 +202,8 @@ nla_put_failure:
 
 int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
 		       struct netlink_callback *cb, int type,
-		       const struct tc_action_ops *ops)
+		       const struct tc_action_ops *ops,
+		       struct netlink_ext_ack *extack)
 {
 	struct tcf_idrinfo *idrinfo = tn->idrinfo;
 
@@ -211,7 +212,8 @@ int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
 	} else if (type == RTM_GETACTION) {
 		return tcf_dump_walker(idrinfo, skb, cb);
 	} else {
-		WARN(1, "tcf_generic_walker: unknown action %d\n", type);
+		WARN(1, "tcf_generic_walker: unknown command %d\n", type);
+		NL_SET_ERR_MSG(extack, "tcf_generic_walker: unknown command");
 		return -EINVAL;
 	}
 }
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 7e01e2c710c4..cb3c5d403c88 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -372,7 +372,7 @@ static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, bpf_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index cb722da0bb15..e4b880fa51fe 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -182,7 +182,7 @@ static int tcf_connmark_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, connmark_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 3e8efadb750f..d5c2e528d150 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -636,7 +636,7 @@ static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, csum_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index d96ebe4bb65a..f072bcf33760 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -206,7 +206,7 @@ static int tcf_gact_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, gact_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index b777e381e0dd..a5994cf0512b 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -829,7 +829,7 @@ static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, ife_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index f33a8cc5dee6..9784629090ad 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -308,7 +308,7 @@ static int tcf_ipt_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, ipt_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index,
@@ -358,7 +358,7 @@ static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, xt_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 3dcd295ea6a7..05c2ebe92eca 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -270,7 +270,7 @@ static int tcf_mirred_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, mirred_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 67243cdc0588..4b5848b6c252 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -283,7 +283,7 @@ static int tcf_nat_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, nat_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 6d6481f6bffa..094303c27c5e 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -424,7 +424,7 @@ static int tcf_pedit_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, pedit_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index ff803414a736..ff55bd6c7db0 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -63,7 +63,7 @@ static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, police_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 7a2b6a33f239..9765145aaf40 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -207,7 +207,7 @@ static int tcf_sample_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, sample_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 3f5474d20702..8244e221fe4f 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -175,7 +175,7 @@ static int tcf_simp_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, simp_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index d99b6f1f5181..ddf69fc01bdf 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -213,7 +213,7 @@ static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, skbedit_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 369ea85d0f02..a406f191cb84 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -237,7 +237,7 @@ static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index bced6fd00d43..41ff9d0e5c62 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -296,7 +296,7 @@ static int tunnel_key_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index,
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 7cf409443d02..71411a255f04 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -272,7 +272,7 @@ static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
 {
 	struct tc_action_net *tn = net_generic(net, vlan_net_id);
 
-	return tcf_generic_walker(tn, skb, cb, type, ops);
+	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
 }
 
 static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index,
-- 
cgit v1.2.3


From 1d4760c75d4f8c7c73f040f0261a09cf1481fdec Mon Sep 17 00:00:00 2001
From: Alexander Aring <aring@mojatatu.com>
Date: Thu, 15 Feb 2018 10:55:00 -0500
Subject: net: sched: act: mirred: add extack support

This patch adds extack support for TC mirred action.

Cc: David Ahern <dsahern@gmail.com>
Signed-off-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_mirred.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 05c2ebe92eca..fd34015331ab 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -80,13 +80,17 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	bool exists = false;
 	int ret;
 
-	if (!nla)
+	if (!nla) {
+		NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed");
 		return -EINVAL;
-	ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, NULL);
+	}
+	ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, extack);
 	if (ret < 0)
 		return ret;
-	if (!tb[TCA_MIRRED_PARMS])
+	if (!tb[TCA_MIRRED_PARMS]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing required mirred parameters");
 		return -EINVAL;
+	}
 	parm = nla_data(tb[TCA_MIRRED_PARMS]);
 
 	exists = tcf_idr_check(tn, parm->index, a, bind);
@@ -102,6 +106,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	default:
 		if (exists)
 			tcf_idr_release(*a, bind);
+		NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option");
 		return -EINVAL;
 	}
 	if (parm->ifindex) {
@@ -117,8 +122,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (!exists) {
-		if (!dev)
+		if (!dev) {
+			NL_SET_ERR_MSG_MOD(extack, "Specified device does not exist");
 			return -EINVAL;
+		}
 		ret = tcf_idr_create(tn, parm->index, est, a,
 				     &act_mirred_ops, bind, true);
 		if (ret)
-- 
cgit v1.2.3


From 1cbec07649ec8c267cdfb486ab4ee73949974ca4 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 16 Feb 2018 11:03:03 -0800
Subject: net: Only honor ifindex in IP_PKTINFO if non-0

Only allow ifindex from IP_PKTINFO to override SO_BINDTODEVICE settings
if the index is actually set in the message.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 008be04ac1cc..9dca0fb8c482 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -258,7 +258,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 			src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
 			if (!ipv6_addr_v4mapped(&src_info->ipi6_addr))
 				return -EINVAL;
-			ipc->oif = src_info->ipi6_ifindex;
+			if (src_info->ipi6_ifindex)
+				ipc->oif = src_info->ipi6_ifindex;
 			ipc->addr = src_info->ipi6_addr.s6_addr32[3];
 			continue;
 		}
@@ -288,7 +289,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
 				return -EINVAL;
 			info = (struct in_pktinfo *)CMSG_DATA(cmsg);
-			ipc->oif = info->ipi_ifindex;
+			if (info->ipi_ifindex)
+				ipc->oif = info->ipi_ifindex;
 			ipc->addr = info->ipi_spec_dst.s_addr;
 			break;
 		}
-- 
cgit v1.2.3


From 7a9b3ec1e19f691f6e69429d851a4084b86e6219 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Wed, 31 Jan 2018 23:07:06 +0100
Subject: nl80211: remove unnecessary genlmsg_cancel() calls

If we free the message immediately, there's no reason to
trim it back to the previous size.

Done with spatch:

@@
identifier msg, hdr;
@@
-if (hdr)
-  genlmsg_cancel(msg, hdr);
... when != msg;
 nlmsg_free(msg);

@@
identifier msg, hdr;
@@
-genlmsg_cancel(msg, hdr);
... when != msg;
 nlmsg_free(msg);

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 28 ----------------------------
 1 file changed, 28 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index cc6ec5bab676..412ed8676306 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5847,7 +5847,6 @@ static int nl80211_get_mesh_config(struct sk_buff *skb,
 	return genlmsg_reply(msg, info);
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
  out:
 	nlmsg_free(msg);
 	return -ENOBUFS;
@@ -6328,7 +6327,6 @@ static int nl80211_get_reg_do(struct sk_buff *skb, struct genl_info *info)
 nla_put_failure_rcu:
 	rcu_read_unlock();
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 put_failure:
 	nlmsg_free(msg);
 	return -EMSGSIZE;
@@ -13732,7 +13730,6 @@ void nl80211_common_reg_change_event(enum nl80211_commands cmd_id,
 	return;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -13780,7 +13777,6 @@ static void nl80211_send_mlme_event(struct cfg80211_registered_device *rdev,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -13868,7 +13864,6 @@ static void nl80211_send_mlme_timeout(struct cfg80211_registered_device *rdev,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -13944,7 +13939,6 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -13984,7 +13978,6 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -14014,7 +14007,6 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -14051,7 +14043,6 @@ void nl80211_send_disconnected(struct cfg80211_registered_device *rdev,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -14084,7 +14075,6 @@ void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -14125,7 +14115,6 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, const u8 *addr,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 EXPORT_SYMBOL(cfg80211_notify_new_peer_candidate);
@@ -14164,7 +14153,6 @@ void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -14219,7 +14207,6 @@ void nl80211_send_beacon_hint_event(struct wiphy *wiphy,
 	return;
 
 nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -14265,7 +14252,6 @@ static void nl80211_send_remain_on_chan_event(
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -14379,7 +14365,6 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 EXPORT_SYMBOL(cfg80211_conn_failed);
@@ -14416,7 +14401,6 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd,
 	return true;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 	return true;
 }
@@ -14500,7 +14484,6 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
 	return genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid);
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 	return -ENOBUFS;
 }
@@ -14544,7 +14527,6 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 EXPORT_SYMBOL(cfg80211_mgmt_tx_status);
@@ -14753,7 +14735,6 @@ static void nl80211_gtk_rekey_notify(struct cfg80211_registered_device *rdev,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -14811,7 +14792,6 @@ nl80211_pmksa_candidate_notify(struct cfg80211_registered_device *rdev,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -14864,7 +14844,6 @@ static void nl80211_ch_switch_notify(struct cfg80211_registered_device *rdev,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -14946,7 +14925,6 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 
@@ -15041,7 +15019,6 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 EXPORT_SYMBOL(cfg80211_probe_status);
@@ -15086,8 +15063,6 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy,
 
  nla_put_failure:
 	spin_unlock_bh(&rdev->beacon_registrations_lock);
-	if (hdr)
-		genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 EXPORT_SYMBOL(cfg80211_report_obss_beacon);
@@ -15303,7 +15278,6 @@ void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer,
 	return;
 
  nla_put_failure:
-	genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 EXPORT_SYMBOL(cfg80211_tdls_oper_request);
@@ -15448,8 +15422,6 @@ void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp)
 	return;
 
  nla_put_failure:
-	if (hdr)
-		genlmsg_cancel(msg, hdr);
 	nlmsg_free(msg);
 }
 EXPORT_SYMBOL(cfg80211_crit_proto_stopped);
-- 
cgit v1.2.3


From db8d93a7a355121d49777c059afbca23c53c8628 Mon Sep 17 00:00:00 2001
From: Srinivas Dasari <dasaris@codeaurora.org>
Date: Fri, 2 Feb 2018 11:15:27 +0200
Subject: nl80211: Fix external_auth check for offloaded authentication

Unfortunately removal of the ext_feature flag in the last revision of
the patch ended up negating the comparison and prevented the command
from being processed (either nl80211_external_auth() or
rdev_external_auth() returns -EOPNOTSUPP). Fix this by adding back the
lost '!'.

Fixes: 40cbfa90218b ("cfg80211/nl80211: Optional authentication offload to userspace")
Signed-off-by: Srinivas Dasari <dasaris@codeaurora.org>
Signed-off-by: Jouni Malinen <jouni@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 412ed8676306..c6f256b29c73 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -12484,7 +12484,7 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info)
 	struct net_device *dev = info->user_ptr[1];
 	struct cfg80211_external_auth_params params;
 
-	if (rdev->ops->external_auth)
+	if (!rdev->ops->external_auth)
 		return -EOPNOTSUPP;
 
 	if (!info->attrs[NL80211_ATTR_SSID])
-- 
cgit v1.2.3


From 11b05ba34d89808ca99180d2656438a319670c56 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Fri, 2 Feb 2018 16:31:42 +0000
Subject: mac80211: remove redundant initialization to pointer 'hdr'

The pointer hrd is being initialized with a value that is never read
and re-assigned a little later, hence the initialization is redundant
and can be removed.

Cleans up clang warning:
net/mac80211/tx.c:1924:24: warning: Value stored to 'hdr' during its
initialization is never read

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/tx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 25904af38839..5decd0fb247c 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1921,7 +1921,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
 {
 	struct ieee80211_local *local = sdata->local;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
-	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
+	struct ieee80211_hdr *hdr;
 	int headroom;
 	bool may_encrypt;
 
-- 
cgit v1.2.3


From c4b50cd31d25c3d17886ffc47ca4a9a12c6dc9bf Mon Sep 17 00:00:00 2001
From: Venkateswara Naralasetty <vnaralas@codeaurora.org>
Date: Tue, 13 Feb 2018 11:03:06 +0530
Subject: cfg80211: send ack_signal to user in probe client response

This patch provides support to get ack signal in probe client response
and in station info from user.

Signed-off-by: Venkateswara Naralasetty <vnaralas@codeaurora.org>
[squash in compilation fixes]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/status.c  | 2 +-
 net/wireless/nl80211.c | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index da7427a41529..d74d44e65bd7 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -486,7 +486,7 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
 			if (ieee80211_is_nullfunc(hdr->frame_control) ||
 			    ieee80211_is_qos_nullfunc(hdr->frame_control))
 				cfg80211_probe_status(sdata->dev, hdr->addr1,
-						      cookie, acked,
+						      cookie, acked, 0, false,
 						      GFP_ATOMIC);
 			else
 				cfg80211_mgmt_tx_status(&sdata->wdev, cookie,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c6f256b29c73..050ff61b06a3 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4486,6 +4486,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 	PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc);
 	PUT_SINFO_U64(BEACON_RX, rx_beacon);
 	PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8);
+	PUT_SINFO(ACK_SIGNAL, ack_signal, u8);
 
 #undef PUT_SINFO
 #undef PUT_SINFO_U64
@@ -14984,7 +14985,8 @@ nla_put_failure:
 EXPORT_SYMBOL(cfg80211_sta_opmode_change_notify);
 
 void cfg80211_probe_status(struct net_device *dev, const u8 *addr,
-			   u64 cookie, bool acked, gfp_t gfp)
+			   u64 cookie, bool acked, s32 ack_signal,
+			   bool is_valid_ack_signal, gfp_t gfp)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
@@ -15009,7 +15011,9 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr,
 	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) ||
 	    nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie,
 			      NL80211_ATTR_PAD) ||
-	    (acked && nla_put_flag(msg, NL80211_ATTR_ACK)))
+	    (acked && nla_put_flag(msg, NL80211_ATTR_ACK)) ||
+	    (is_valid_ack_signal && nla_put_s32(msg, NL80211_ATTR_ACK_SIGNAL,
+						ack_signal)))
 		goto nla_put_failure;
 
 	genlmsg_end(msg, hdr);
-- 
cgit v1.2.3


From a78b26fffd2368fcd079802897f4c97f9baea833 Mon Sep 17 00:00:00 2001
From: Venkateswara Naralasetty <vnaralas@codeaurora.org>
Date: Tue, 13 Feb 2018 11:04:46 +0530
Subject: mac80211: Add tx ack signal support in sta info

This allows users to get ack signal strength of
last transmitted frame.

Signed-off-by: Venkateswara Naralasetty <vnaralas@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/sta_info.c |  6 ++++++
 net/mac80211/sta_info.h |  2 ++
 net/mac80211/status.c   | 13 +++++++++++--
 3 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 0c5627f8a104..0bc40c719a55 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2287,6 +2287,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
 		sinfo->filled |= BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT);
 		sinfo->expected_throughput = thr;
 	}
+
+	if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) &&
+	    sta->status_stats.ack_signal_filled) {
+		sinfo->ack_signal = sta->status_stats.last_ack_signal;
+		sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL);
+	}
 }
 
 u32 sta_get_expected_throughput(struct sta_info *sta)
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index cd53619435b6..f64eb86ca64b 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -548,6 +548,8 @@ struct sta_info {
 		u64 msdu_retries[IEEE80211_NUM_TIDS + 1];
 		u64 msdu_failed[IEEE80211_NUM_TIDS + 1];
 		unsigned long last_ack;
+		s8 last_ack_signal;
+		bool ack_signal_filled;
 	} status_stats;
 
 	/* Updated from TX path only, no locking requirements */
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index d74d44e65bd7..743e89c5926c 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -187,9 +187,16 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb)
 	struct ieee80211_mgmt *mgmt = (void *) skb->data;
 	struct ieee80211_local *local = sta->local;
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
+	struct ieee80211_tx_info *txinfo = IEEE80211_SKB_CB(skb);
 
-	if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
+	if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) {
 		sta->status_stats.last_ack = jiffies;
+		if (txinfo->status.is_valid_ack_signal) {
+			sta->status_stats.last_ack_signal =
+					 (s8)txinfo->status.ack_signal;
+			sta->status_stats.ack_signal_filled = true;
+		}
+	}
 
 	if (ieee80211_is_data_qos(mgmt->frame_control)) {
 		struct ieee80211_hdr *hdr = (void *) skb->data;
@@ -486,7 +493,9 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local,
 			if (ieee80211_is_nullfunc(hdr->frame_control) ||
 			    ieee80211_is_qos_nullfunc(hdr->frame_control))
 				cfg80211_probe_status(sdata->dev, hdr->addr1,
-						      cookie, acked, 0, false,
+						      cookie, acked,
+						      info->status.ack_signal,
+						      info->status.is_valid_ack_signal,
 						      GFP_ATOMIC);
 			else
 				cfg80211_mgmt_tx_status(&sdata->wdev, cookie,
-- 
cgit v1.2.3


From 753d525a08f2e18d50da9a909d48ecb7596683b3 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:48:04 +0300
Subject: net: Convert inet6_net_ops

init method initializes sysctl defaults, allocates
percpu arrays and creates /proc entries.
exit method reverts the above.

There are no pernet_operations, which are interested
in the above entities of foreign net namespace, so
inet6_net_ops are able to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/af_inet6.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index c1e292db04db..dbbe04018813 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -857,6 +857,7 @@ static void __net_exit inet6_net_exit(struct net *net)
 static struct pernet_operations inet6_net_ops = {
 	.init = inet6_net_init,
 	.exit = inet6_net_exit,
+	.async = true,
 };
 
 static const struct ipv6_stub ipv6_stub_impl = {
-- 
cgit v1.2.3


From 9c537ca1554e5d7db69a3b606801a2101e02edda Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:48:14 +0300
Subject: net: Convert cfg80211_pernet_ops

This patch finishes converting pernet_operations
registered in net/wireless directory.

These pernet_operations have only exit method,
which moves devices to init_net. This action
is not pernet_operations-specific, and function
cfg80211_switch_netns() may be called all time
during the system life. All necessary protection
against concurrent cfg80211_pernet_exit() is made
by rtnl_lock(). So, cfg80211_pernet_ops is able
to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/wireless/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/wireless/core.c b/net/wireless/core.c
index a6f3cac8c640..670aa229168a 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1340,6 +1340,7 @@ static void __net_exit cfg80211_pernet_exit(struct net *net)
 
 static struct pernet_operations cfg80211_pernet_ops = {
 	.exit = cfg80211_pernet_exit,
+	.async = true,
 };
 
 static int __init cfg80211_init(void)
-- 
cgit v1.2.3


From b01a59a4884ed2d95b8db4741e552a689c18989b Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:48:37 +0300
Subject: net: Convert ip6mr_net_ops

These pernet_operations create and destroy /proc entries,
populate and depopulate net::rules_ops and multiroute table.
All the structures are pernet, and they are not touched
by foreign net pernet_operations. So, it's possible to mark
them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 9f6cace9c817..295eb5ecaee5 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1397,6 +1397,7 @@ static void __net_exit ip6mr_net_exit(struct net *net)
 static struct pernet_operations ip6mr_net_ops = {
 	.init = ip6mr_net_init,
 	.exit = ip6mr_net_exit,
+	.async = true,
 };
 
 int __init ip6_mr_init(void)
-- 
cgit v1.2.3


From 1a2e93329dd413585798c122aac0607093b2d379 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:48:57 +0300
Subject: net: Convert icmpv6_sk_ops, ndisc_net_ops and igmp6_net_ops

These pernet_operations create and destroy net::ipv6.icmp_sk
socket, used to send ICMP or error reply.

Nobody can dereference the socket to handle a packet before
net is initialized, as there is no routing; nobody can do
that in parallel with exit, as all of devices are moved
to init_net or destroyed and there are no packets it-flight.
So, it's possible to mark these pernet_operations as async.

The same for ndisc_net_ops and for igmp6_net_ops. The last
one also creates and destroys /proc entries.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/icmp.c  | 1 +
 net/ipv6/mcast.c | 1 +
 net/ipv6/ndisc.c | 1 +
 3 files changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 6ae5dd3f4d0d..4fa4f1b150a4 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -997,6 +997,7 @@ static void __net_exit icmpv6_sk_exit(struct net *net)
 static struct pernet_operations icmpv6_sk_ops = {
 	.init = icmpv6_sk_init,
 	.exit = icmpv6_sk_exit,
+	.async = true,
 };
 
 int __init icmpv6_init(void)
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 9b9d2ff01b35..d9bb933dd5c4 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -2997,6 +2997,7 @@ static void __net_exit igmp6_net_exit(struct net *net)
 static struct pernet_operations igmp6_net_ops = {
 	.init = igmp6_net_init,
 	.exit = igmp6_net_exit,
+	.async = true,
 };
 
 int __init igmp6_init(void)
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index f61a5b613b52..0a19ce3a6f7f 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1882,6 +1882,7 @@ static void __net_exit ndisc_net_exit(struct net *net)
 static struct pernet_operations ndisc_net_ops = {
 	.init = ndisc_net_init,
 	.exit = ndisc_net_exit,
+	.async = true,
 };
 
 int __init ndisc_init(void)
-- 
cgit v1.2.3


From 509114112d0b71997dbe58d4f2976eeddf8eacc4 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:49:10 +0300
Subject: net: Convert raw6_net_ops, udplite6_net_ops, ipv6_proc_ops,
 if6_proc_net_ops and ip6_route_net_late_ops

These pernet_operations create and destroy /proc entries
and safely may be converted and safely may be mark as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 1 +
 net/ipv6/proc.c     | 1 +
 net/ipv6/raw.c      | 1 +
 net/ipv6/route.c    | 2 ++
 4 files changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8c17f8d8d5d9..4facfe0b1888 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4257,6 +4257,7 @@ static void __net_exit if6_proc_net_exit(struct net *net)
 static struct pernet_operations if6_proc_net_ops = {
 	.init = if6_proc_net_init,
 	.exit = if6_proc_net_exit,
+	.async = true,
 };
 
 int __init if6_proc_init(void)
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index b67814242f78..b8858c546f41 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -343,6 +343,7 @@ static void __net_exit ipv6_proc_exit_net(struct net *net)
 static struct pernet_operations ipv6_proc_ops = {
 	.init = ipv6_proc_init_net,
 	.exit = ipv6_proc_exit_net,
+	.async = true,
 };
 
 int __init ipv6_misc_proc_init(void)
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 4c25339b1984..10a4ac4933b7 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1332,6 +1332,7 @@ static void __net_exit raw6_exit_net(struct net *net)
 static struct pernet_operations raw6_net_ops = {
 	.init = raw6_init_net,
 	.exit = raw6_exit_net,
+	.async = true,
 };
 
 int __init raw6_proc_init(void)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f0baae26db8f..1be84ef23f43 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4972,6 +4972,7 @@ static void __net_exit ip6_route_net_exit_late(struct net *net)
 static struct pernet_operations ip6_route_net_ops = {
 	.init = ip6_route_net_init,
 	.exit = ip6_route_net_exit,
+	.async = true,
 };
 
 static int __net_init ipv6_inetpeer_init(struct net *net)
@@ -5002,6 +5003,7 @@ static struct pernet_operations ipv6_inetpeer_ops = {
 static struct pernet_operations ip6_route_net_late_ops = {
 	.init = ip6_route_net_init_late,
 	.exit = ip6_route_net_exit_late,
+	.async = true,
 };
 
 static struct notifier_block ip6_route_dev_notifier = {
-- 
cgit v1.2.3


From 85ca51b2a23969dccb5abff31eb54560db8195ca Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:49:20 +0300
Subject: net: Convert ipv6_inetpeer_ops

net->ipv6.peers is dereferenced in three places via inet_getpeer_v6(),
and it's used to handle skb. All the users of inet_getpeer_v6() do not
look like be able to be called from foreign net pernet_operations, so
we may mark them as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1be84ef23f43..aa709b644945 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4998,6 +4998,7 @@ static void __net_exit ipv6_inetpeer_exit(struct net *net)
 static struct pernet_operations ipv6_inetpeer_ops = {
 	.init	=	ipv6_inetpeer_init,
 	.exit	=	ipv6_inetpeer_exit,
+	.async	=	true,
 };
 
 static struct pernet_operations ip6_route_net_late_ops = {
-- 
cgit v1.2.3


From 7b7dd180b85b9c39c426cfaade73f7d0409c7f85 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:49:31 +0300
Subject: net: Convert fib6_rules_net_ops

These pernet_operations register and unregister
net::ipv6.fib6_rules_ops, which are used for
routing. It looks like there are no pernet_operations,
which send ipv6 packages to another net, so we
are able to mark them as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/fib6_rules.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index b240f24a6e52..95a2c9e8699a 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -368,6 +368,7 @@ static void __net_exit fib6_rules_net_exit(struct net *net)
 static struct pernet_operations fib6_rules_net_ops = {
 	.init = fib6_rules_net_init,
 	.exit = fib6_rules_net_exit,
+	.async = true,
 };
 
 int __init fib6_rules_init(void)
-- 
cgit v1.2.3


From fef65a2c6c3417fcf2fb0bebd8c27355d1b14a12 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:49:40 +0300
Subject: net: Convert tcpv6_net_ops

These pernet_operations create and destroy net::ipv6.tcp_sk
socket, which is used in tcp_v6_send_response() only. It looks
like foreign pernet_operations don't want to set ipv6 connection
inside destroyed net, so this socket may be created in destroyed
in parallel with anything else. inet_twsk_purge() is also safe
for that, as described in patch for tcp_sk_ops. So, it's possible
to mark them as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/tcp_ipv6.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 883df0ad5bfe..5425d7b100ee 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2007,6 +2007,7 @@ static struct pernet_operations tcpv6_net_ops = {
 	.init	    = tcpv6_net_init,
 	.exit	    = tcpv6_net_exit,
 	.exit_batch = tcpv6_net_exit_batch,
+	.async	    = true,
 };
 
 int __init tcpv6_init(void)
-- 
cgit v1.2.3


From 58708caef56b04da2c91d6aaba49c783f22055c2 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:49:49 +0300
Subject: net: Convert ipv6_sysctl_net_ops

These pernet_operations create and destroy sysctl tables.
They are not touched by another net pernet_operations.
So, it's possible to execute them in parallel with others.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/sysctl_net_ipv6.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index a789a8ac6a64..262f791f1b9b 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -251,6 +251,7 @@ static void __net_exit ipv6_sysctl_net_exit(struct net *net)
 static struct pernet_operations ipv6_sysctl_net_ops = {
 	.init = ipv6_sysctl_net_init,
 	.exit = ipv6_sysctl_net_exit,
+	.async = true,
 };
 
 static struct ctl_table_header *ip6_header;
-- 
cgit v1.2.3


From ac34cb6c0c4d327d69e588d7a42bbc6428e2fdfd Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:49:59 +0300
Subject: net: Convert ping_v6_net_ops

These pernet_operations only register and unregister /proc
entries, so it's possible to mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ping.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index d12c55dad7d1..318c6e914234 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -240,6 +240,7 @@ static void __net_init ping_v6_proc_exit_net(struct net *net)
 static struct pernet_operations ping_v6_net_ops = {
 	.init = ping_v6_proc_init_net,
 	.exit = ping_v6_proc_exit_net,
+	.async = true,
 };
 #endif
 
-- 
cgit v1.2.3


From a7852a76f414f69631ed3adc4b001c633829306d Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:50:09 +0300
Subject: net: Convert ip6_flowlabel_net_ops

These pernet_operations create and destroy /proc entries.
ip6_fl_purge() makes almost the same actions as timer
ip6_fl_gc_timer does, and as it can be executed in parallel
with ip6_fl_purge(), two parallel ip6_fl_purge() may be
executed. So, we can mark it async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_flowlabel.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 3dab664ff503..6ddf52282894 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -873,6 +873,7 @@ static void __net_exit ip6_flowlabel_net_exit(struct net *net)
 static struct pernet_operations ip6_flowlabel_net_ops = {
 	.init = ip6_flowlabel_proc_init,
 	.exit = ip6_flowlabel_net_exit,
+	.async = true,
 };
 
 int ip6_flowlabel_init(void)
-- 
cgit v1.2.3


From b489141369f78ead6ed540cff29ac1974852cd7f Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:50:18 +0300
Subject: net: Convert xfrm6_net_ops

These pernet_operations create sysctl tables and
initialize net::xfrm.xfrm6_dst_ops used for routing.
It doesn't look like another pernet_operations send
ipv6 packets to foreign net namespaces, so it should
be safe to mark the pernet_operations as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/xfrm6_policy.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 09fb44ee3b45..88cd0c90fa81 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -395,6 +395,7 @@ static void __net_exit xfrm6_net_exit(struct net *net)
 static struct pernet_operations xfrm6_net_ops = {
 	.init	= xfrm6_net_init,
 	.exit	= xfrm6_net_exit,
+	.async	= true,
 };
 
 int __init xfrm6_init(void)
-- 
cgit v1.2.3


From d16784d9fb2a91e96374df7cb689f52cef55371b Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:50:28 +0300
Subject: net: Convert fib6_net_ops, ipv6_addr_label_ops and ip6_segments_ops

These pernet_operations register and unregister tables
and lists for packets forwarding. All of the entities
are per-net. Init methods makes simple initializations,
and since net is not visible for foreigners at the time
it is working, it can't race with anything. Exit method
is executed when there are only local devices, and there
mustn't be packets in-flight. Also, it looks like no one
pernet_operations want to send ipv6 packets to foreign
net. The same reasons are for ipv6_addr_label_ops and
ip6_segments_ops. So, we are able to mark all them as
async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrlabel.c | 1 +
 net/ipv6/ip6_fib.c   | 1 +
 net/ipv6/seg6.c      | 1 +
 3 files changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 1d6ced37ad71..ba2e63633370 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -344,6 +344,7 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net)
 static struct pernet_operations ipv6_addr_label_ops = {
 	.init = ip6addrlbl_net_init,
 	.exit = ip6addrlbl_net_exit,
+	.async = true,
 };
 
 int __init ipv6_addr_label_init(void)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 92b8d8c75eed..cab95cf3b39f 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -2160,6 +2160,7 @@ static void fib6_net_exit(struct net *net)
 static struct pernet_operations fib6_net_ops = {
 	.init = fib6_net_init,
 	.exit = fib6_net_exit,
+	.async = true,
 };
 
 int __init fib6_init(void)
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 7f5621d09571..c3f13c3bd8a9 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -395,6 +395,7 @@ static void __net_exit seg6_net_exit(struct net *net)
 static struct pernet_operations ip6_segments_ops = {
 	.init = seg6_net_init,
 	.exit = seg6_net_exit,
+	.async = true,
 };
 
 static const struct genl_ops seg6_genl_ops[] = {
-- 
cgit v1.2.3


From 5fc094f5b8c9b8e5012b7ccf303be3b123563a58 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:50:37 +0300
Subject: net: Convert ip6_frags_ops

Exit methods calls inet_frags_exit_net() with global ip6_frags
as argument. So, after we make the pernet_operations async,
a pair of exit methods may be called to iterate this hash table.
Since there is inet_frag_worker(), which already may work
in parallel with inet_frags_exit_net(), and it can make the same
cleanup, that inet_frags_exit_net() does, it's safe. So we may
mark these pernet_operations as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/reassembly.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index afbc000ad4f2..b5da69c83123 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -733,6 +733,7 @@ static void __net_exit ipv6_frags_exit_net(struct net *net)
 static struct pernet_operations ip6_frags_ops = {
 	.init = ipv6_frags_init_net,
 	.exit = ipv6_frags_exit_net,
+	.async = true,
 };
 
 int __init ipv6_frag_init(void)
-- 
cgit v1.2.3


From 4d6b80762b9384db3281f792a71746b388f395be Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:50:45 +0300
Subject: net: Convert ip_tables_net_ops, udplite6_net_ops and xt_net_ops

ip_tables_net_ops and udplite6_net_ops create and destroy /proc entries.
xt_net_ops does nothing.

So, we are able to mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_tables.c | 1 +
 net/ipv6/udplite.c             | 1 +
 net/netfilter/x_tables.c       | 1 +
 3 files changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 9a71f3149507..39a7cf9160e6 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1911,6 +1911,7 @@ static void __net_exit ip_tables_net_exit(struct net *net)
 static struct pernet_operations ip_tables_net_ops = {
 	.init = ip_tables_net_init,
 	.exit = ip_tables_net_exit,
+	.async = true,
 };
 
 static int __init ip_tables_init(void)
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 14ae32bb1f3d..f3839780dc31 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -123,6 +123,7 @@ static void __net_exit udplite6_proc_exit_net(struct net *net)
 static struct pernet_operations udplite6_net_ops = {
 	.init = udplite6_proc_init_net,
 	.exit = udplite6_proc_exit_net,
+	.async = true,
 };
 
 int __init udplite6_proc_init(void)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 2f685ee1f9c8..a6a435d7c8f4 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1765,6 +1765,7 @@ static void __net_exit xt_net_exit(struct net *net)
 static struct pernet_operations xt_net_ops = {
 	.init = xt_net_init,
 	.exit = xt_net_exit,
+	.async = true,
 };
 
 static int __init xt_init(void)
-- 
cgit v1.2.3


From da349fad8045cc44cd9b1752b0e2d943df62cabf Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 11:50:54 +0300
Subject: net: Convert iptable_filter_net_ops

These pernet_operations register and unregister
net::ipv4.iptable_filter table. Since there are
no packets in-flight at the time of exit method
is working, iptables rules should not be touched.
Also, pernet_operations should not send ipv4
packets each other. So, it's safe to mark them
async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/iptable_filter.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 9ac92ea7b93c..c1c136a93911 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -87,6 +87,7 @@ static void __net_exit iptable_filter_net_exit(struct net *net)
 static struct pernet_operations iptable_filter_net_ops = {
 	.init = iptable_filter_net_init,
 	.exit = iptable_filter_net_exit,
+	.async = true,
 };
 
 static int __init iptable_filter_init(void)
-- 
cgit v1.2.3


From 96c252bf1c5c6d7e2dac3dea42f3f0a9c939d20e Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Mon, 19 Feb 2018 12:48:21 +0100
Subject: tipc: fix bug on error path in tipc_topsrv_kern_subscr()

In commit cc1ea9ffadf7 ("tipc: eliminate struct tipc_subscriber") we
re-introduced an old bug on the error path in the function
tipc_topsrv_kern_subscr(). We now re-introduce the correction too.

Reported-by: syzbot+f62e0f2a0ef578703946@syzkaller.appspotmail.com
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/topsrv.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c
index 02013e00f287..25925be1cc08 100644
--- a/net/tipc/topsrv.c
+++ b/net/tipc/topsrv.c
@@ -580,9 +580,10 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
 	*conid = con->conid;
 	con->sock = NULL;
 	rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub);
-	if (rc < 0)
-		tipc_conn_close(con);
-	return !rc;
+	if (rc >= 0)
+		return true;
+	conn_put(con);
+	return false;
 }
 
 void tipc_topsrv_kern_unsubscr(struct net *net, int conid)
-- 
cgit v1.2.3


From 26736a08ee0fb89a4f09bfb2c9f0805028ff63aa Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Mon, 19 Feb 2018 19:02:24 +0100
Subject: tipc: don't call sock_release() in atomic context

syzbot reported a scheduling while atomic issue at netns
destruction time:

BUG: sleeping function called from invalid context at net/core/sock.c:2769
in_atomic(): 1, irqs_disabled(): 0, pid: 85, name: kworker/u4:3
5 locks held by kworker/u4:3/85:
  #0:  ((wq_completion)"%s""netns"){+.+.}, at: [<00000000c9792deb>]
process_one_work+0xaaf/0x1af0 kernel/workqueue.c:2084
  #1:  (net_cleanup_work){+.+.}, at: [<00000000adc12e2a>]
process_one_work+0xb01/0x1af0 kernel/workqueue.c:2088
  #2:  (net_sem){++++}, at: [<000000009ccb5669>] cleanup_net+0x23f/0xd20
net/core/net_namespace.c:494
  #3:  (net_mutex){+.+.}, at: [<00000000a92767d9>] cleanup_net+0xa7d/0xd20
net/core/net_namespace.c:496
  #4:  (&(&srv->idr_lock)->rlock){+...}, at: [<000000001343e568>]
spin_lock_bh include/linux/spinlock.h:315 [inline]
  #4:  (&(&srv->idr_lock)->rlock){+...}, at: [<000000001343e568>]
tipc_topsrv_stop+0x231/0x610 net/tipc/topsrv.c:685
CPU: 0 PID: 85 Comm: kworker/u4:3 Not tainted 4.16.0-rc1+ #230
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Workqueue: netns cleanup_net
Call Trace:
  __dump_stack lib/dump_stack.c:17 [inline]
  dump_stack+0x194/0x257 lib/dump_stack.c:53
  ___might_sleep+0x2b2/0x470 kernel/sched/core.c:6128
  __might_sleep+0x95/0x190 kernel/sched/core.c:6081
  lock_sock_nested+0x37/0x110 net/core/sock.c:2769
  lock_sock include/net/sock.h:1463 [inline]
  tipc_release+0x103/0xff0 net/tipc/socket.c:572
  sock_release+0x8d/0x1e0 net/socket.c:594
  tipc_topsrv_stop+0x3c0/0x610 net/tipc/topsrv.c:696
  tipc_exit_net+0x15/0x40 net/tipc/core.c:96
  ops_exit_list.isra.6+0xae/0x150 net/core/net_namespace.c:148
  cleanup_net+0x6ba/0xd20 net/core/net_namespace.c:529
  process_one_work+0xbbf/0x1af0 kernel/workqueue.c:2113
  worker_thread+0x223/0x1990 kernel/workqueue.c:2247
  kthread+0x33c/0x400 kernel/kthread.c:238
  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:429

This is caused by tipc_topsrv_stop() releasing the listener socket
with the idr lock held. This changeset addresses the issue moving
the release operation outside such lock.

Reported-and-tested-by: syzbot+749d9d87c294c00ca856@syzkaller.appspotmail.com
Fixes: 0ef897be12b8 ("tipc: separate topology server listener socket from subcsriber sockets")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by:  ///jon
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/topsrv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c
index 25925be1cc08..c8e34ef22c30 100644
--- a/net/tipc/topsrv.c
+++ b/net/tipc/topsrv.c
@@ -694,9 +694,9 @@ void tipc_topsrv_stop(struct net *net)
 	}
 	__module_get(lsock->ops->owner);
 	__module_get(lsock->sk->sk_prot_creator->owner);
-	sock_release(lsock);
 	srv->listener = NULL;
 	spin_unlock_bh(&srv->idr_lock);
+	sock_release(lsock);
 	tipc_topsrv_work_stop(srv);
 	idr_destroy(&srv->conn_idr);
 	kfree(srv);
-- 
cgit v1.2.3


From 19efbd93e6fb05eab81856b4fc8d64211dd37088 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 12:58:38 +0300
Subject: net: Kill net_mutex

We take net_mutex, when there are !async pernet_operations
registered, and read locking of net_sem is not enough. But
we may get rid of taking the mutex, and just change the logic
to write lock net_sem in such cases. This obviously reduces
the number of lock operations, we do.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 53 +++++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index bcab9a938d6f..e89a516620dd 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -29,8 +29,6 @@
 
 static LIST_HEAD(pernet_list);
 static struct list_head *first_device = &pernet_list;
-/* Used only if there are !async pernet_operations registered */
-DEFINE_MUTEX(net_mutex);
 
 LIST_HEAD(net_namespace_list);
 EXPORT_SYMBOL_GPL(net_namespace_list);
@@ -407,6 +405,7 @@ struct net *copy_net_ns(unsigned long flags,
 {
 	struct ucounts *ucounts;
 	struct net *net;
+	unsigned write;
 	int rv;
 
 	if (!(flags & CLONE_NEWNET))
@@ -424,20 +423,26 @@ struct net *copy_net_ns(unsigned long flags,
 	refcount_set(&net->passive, 1);
 	net->ucounts = ucounts;
 	get_user_ns(user_ns);
-
-	rv = down_read_killable(&net_sem);
+again:
+	write = READ_ONCE(nr_sync_pernet_ops);
+	if (write)
+		rv = down_write_killable(&net_sem);
+	else
+		rv = down_read_killable(&net_sem);
 	if (rv < 0)
 		goto put_userns;
-	if (nr_sync_pernet_ops) {
-		rv = mutex_lock_killable(&net_mutex);
-		if (rv < 0)
-			goto up_read;
+
+	if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) {
+		up_read(&net_sem);
+		goto again;
 	}
 	rv = setup_net(net, user_ns);
-	if (nr_sync_pernet_ops)
-		mutex_unlock(&net_mutex);
-up_read:
-	up_read(&net_sem);
+
+	if (write)
+		up_write(&net_sem);
+	else
+		up_read(&net_sem);
+
 	if (rv < 0) {
 put_userns:
 		put_user_ns(user_ns);
@@ -485,15 +490,23 @@ static void cleanup_net(struct work_struct *work)
 	struct net *net, *tmp, *last;
 	struct list_head net_kill_list;
 	LIST_HEAD(net_exit_list);
+	unsigned write;
 
 	/* Atomically snapshot the list of namespaces to cleanup */
 	spin_lock_irq(&cleanup_list_lock);
 	list_replace_init(&cleanup_list, &net_kill_list);
 	spin_unlock_irq(&cleanup_list_lock);
+again:
+	write = READ_ONCE(nr_sync_pernet_ops);
+	if (write)
+		down_write(&net_sem);
+	else
+		down_read(&net_sem);
 
-	down_read(&net_sem);
-	if (nr_sync_pernet_ops)
-		mutex_lock(&net_mutex);
+	if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) {
+		up_read(&net_sem);
+		goto again;
+	}
 
 	/* Don't let anyone else find us. */
 	rtnl_lock();
@@ -528,14 +541,14 @@ static void cleanup_net(struct work_struct *work)
 	list_for_each_entry_reverse(ops, &pernet_list, list)
 		ops_exit_list(ops, &net_exit_list);
 
-	if (nr_sync_pernet_ops)
-		mutex_unlock(&net_mutex);
-
 	/* Free the net generic variables */
 	list_for_each_entry_reverse(ops, &pernet_list, list)
 		ops_free_list(ops, &net_exit_list);
 
-	up_read(&net_sem);
+	if (write)
+		up_write(&net_sem);
+	else
+		up_read(&net_sem);
 
 	/* Ensure there are no outstanding rcu callbacks using this
 	 * network namespace.
@@ -563,8 +576,6 @@ static void cleanup_net(struct work_struct *work)
 void net_ns_barrier(void)
 {
 	down_write(&net_sem);
-	mutex_lock(&net_mutex);
-	mutex_unlock(&net_mutex);
 	up_write(&net_sem);
 }
 EXPORT_SYMBOL(net_ns_barrier);
-- 
cgit v1.2.3


From 65b7b5b90fcd17b25ef43b0cd02bda47bf286675 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 12:58:45 +0300
Subject: net: Make cleanup_list and net::cleanup_list of llist type

This simplifies cleanup queueing and makes cleanup lists
to use llist primitives. Since llist has its own cmpxchg()
ordering, cleanup_list_lock is not more need.

Also, struct llist_node is smaller, than struct list_head,
so we save some bytes in struct net with this patch.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index e89a516620dd..abf8a46e94e2 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -481,21 +481,18 @@ static void unhash_nsid(struct net *net, struct net *last)
 	spin_unlock_bh(&net->nsid_lock);
 }
 
-static DEFINE_SPINLOCK(cleanup_list_lock);
-static LIST_HEAD(cleanup_list);  /* Must hold cleanup_list_lock to touch */
+static LLIST_HEAD(cleanup_list);
 
 static void cleanup_net(struct work_struct *work)
 {
 	const struct pernet_operations *ops;
 	struct net *net, *tmp, *last;
-	struct list_head net_kill_list;
+	struct llist_node *net_kill_list;
 	LIST_HEAD(net_exit_list);
 	unsigned write;
 
 	/* Atomically snapshot the list of namespaces to cleanup */
-	spin_lock_irq(&cleanup_list_lock);
-	list_replace_init(&cleanup_list, &net_kill_list);
-	spin_unlock_irq(&cleanup_list_lock);
+	net_kill_list = llist_del_all(&cleanup_list);
 again:
 	write = READ_ONCE(nr_sync_pernet_ops);
 	if (write)
@@ -510,7 +507,7 @@ again:
 
 	/* Don't let anyone else find us. */
 	rtnl_lock();
-	list_for_each_entry(net, &net_kill_list, cleanup_list)
+	llist_for_each_entry(net, net_kill_list, cleanup_list)
 		list_del_rcu(&net->list);
 	/* Cache last net. After we unlock rtnl, no one new net
 	 * added to net_namespace_list can assign nsid pointer
@@ -525,7 +522,7 @@ again:
 	last = list_last_entry(&net_namespace_list, struct net, list);
 	rtnl_unlock();
 
-	list_for_each_entry(net, &net_kill_list, cleanup_list) {
+	llist_for_each_entry(net, net_kill_list, cleanup_list) {
 		unhash_nsid(net, last);
 		list_add_tail(&net->exit_list, &net_exit_list);
 	}
@@ -585,12 +582,7 @@ static DECLARE_WORK(net_cleanup_work, cleanup_net);
 void __put_net(struct net *net)
 {
 	/* Cleanup the network namespace in process context */
-	unsigned long flags;
-
-	spin_lock_irqsave(&cleanup_list_lock, flags);
-	list_add(&net->cleanup_list, &cleanup_list);
-	spin_unlock_irqrestore(&cleanup_list_lock, flags);
-
+	llist_add(&net->cleanup_list, &cleanup_list);
 	queue_work(netns_wq, &net_cleanup_work);
 }
 EXPORT_SYMBOL_GPL(__put_net);
-- 
cgit v1.2.3


From 8349efd903394422d1598d196b6be70c410cf8f5 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Feb 2018 12:58:54 +0300
Subject: net: Queue net_cleanup_work only if there is first net added

When llist_add() returns false, cleanup_net() hasn't made its
llist_del_all(), while the work has already been scheduled
by the first queuer. So, we may skip queue_work() in this case.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index abf8a46e94e2..27a55236ad64 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -582,8 +582,8 @@ static DECLARE_WORK(net_cleanup_work, cleanup_net);
 void __put_net(struct net *net)
 {
 	/* Cleanup the network namespace in process context */
-	llist_add(&net->cleanup_list, &cleanup_list);
-	queue_work(netns_wq, &net_cleanup_work);
+	if (llist_add(&net->cleanup_list, &cleanup_list))
+		queue_work(netns_wq, &net_cleanup_work);
 }
 EXPORT_SYMBOL_GPL(__put_net);
 
-- 
cgit v1.2.3


From cc944ead839a28b07cea4f90878f2b7e0cb71a02 Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Tue, 20 Feb 2018 08:44:20 +0100
Subject: devlink: Move size validation to core

Currently the size validation is done via a cb, which is unneeded. The
size validation can be moved to core. The next patch will perform cleanup.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 18d385ed8237..88e846779269 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2338,6 +2338,32 @@ out:
 	resource->size_valid = size_valid;
 }
 
+static int
+devlink_resource_validate_size(struct devlink_resource *resource, u64 size,
+			       struct netlink_ext_ack *extack)
+{
+	u64 reminder;
+	int err = 0;
+
+	if (size > resource->size_params->size_max) {
+		NL_SET_ERR_MSG_MOD(extack, "Size larger than maximum");
+		err = -EINVAL;
+	}
+
+	if (size < resource->size_params->size_min) {
+		NL_SET_ERR_MSG_MOD(extack, "Size smaller than minimum");
+		err = -EINVAL;
+	}
+
+	div64_u64_rem(size, resource->size_params->size_granularity, &reminder);
+	if (reminder) {
+		NL_SET_ERR_MSG_MOD(extack, "Wrong granularity");
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
 static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
 				       struct genl_info *info)
 {
@@ -2356,12 +2382,8 @@ static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
 	if (!resource)
 		return -EINVAL;
 
-	if (!resource->resource_ops->size_validate)
-		return -EINVAL;
-
 	size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
-	err = resource->resource_ops->size_validate(devlink, size,
-						    info->extack);
+	err = devlink_resource_validate_size(resource, size, info->extack);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 3fef2b6290e85de93a7096381e77f9a1b7544278 Mon Sep 17 00:00:00 2001
From: Antonio Cardace <anto.cardace@gmail.com>
Date: Mon, 19 Feb 2018 11:37:15 +0000
Subject: x25: use %*ph to print small buffer

Use %*ph format to print small buffer as hex string.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Antonio Cardace <anto.cardace@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/x25/x25_subr.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index db0b1315d577..9c214ec681ac 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -335,8 +335,7 @@ int x25_decode(struct sock *sk, struct sk_buff *skb, int *ns, int *nr, int *q,
 		}
 	}
 
-	pr_debug("invalid PLP frame %02X %02X %02X\n",
-	       frame[0], frame[1], frame[2]);
+	pr_debug("invalid PLP frame %3ph\n", frame);
 
 	return X25_ILLEGAL;
 }
-- 
cgit v1.2.3


From ccc007e4a746bb592d3e72106f00241f81d51410 Mon Sep 17 00:00:00 2001
From: Eyal Birger <eyal.birger@gmail.com>
Date: Thu, 15 Feb 2018 19:42:43 +0200
Subject: net: sched: add em_ipt ematch for calling xtables matches

The commit a new tc ematch for using netfilter xtable matches.

This allows early classification as well as mirroning/redirecting traffic
based on logic implemented in netfilter extensions.

Current supported use case is classification based on the incoming IPSec
state used during decpsulation using the 'policy' iptables extension
(xt_policy).

The module dynamically fetches the netfilter match module and calls
it using a fake xt_action_param structure based on validated userspace
provided parameters.

As the xt_policy match does not access skb->data, no skb modifications
are needed on match.

Signed-off-by: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/Kconfig  |  12 +++
 net/sched/Makefile |   1 +
 net/sched/em_ipt.c | 257 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 270 insertions(+)
 create mode 100644 net/sched/em_ipt.c

(limited to 'net')

diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index f24a6ae6819a..a01169fb5325 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -658,6 +658,18 @@ config NET_EMATCH_IPSET
 	  To compile this code as a module, choose M here: the
 	  module will be called em_ipset.
 
+config NET_EMATCH_IPT
+	tristate "IPtables Matches"
+	depends on NET_EMATCH && NETFILTER && NETFILTER_XTABLES
+	---help---
+	  Say Y here to be able to classify packets based on iptables
+	  matches.
+	  Current supported match is "policy" which allows packet classification
+	  based on IPsec policy that was used during decapsulation
+
+	  To compile this code as a module, choose M here: the
+	  module will be called em_ipt.
+
 config NET_CLS_ACT
 	bool "Actions"
 	select NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 5b635447e3f8..8811d3804878 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -75,3 +75,4 @@ obj-$(CONFIG_NET_EMATCH_META)	+= em_meta.o
 obj-$(CONFIG_NET_EMATCH_TEXT)	+= em_text.o
 obj-$(CONFIG_NET_EMATCH_CANID)	+= em_canid.o
 obj-$(CONFIG_NET_EMATCH_IPSET)	+= em_ipset.o
+obj-$(CONFIG_NET_EMATCH_IPT)	+= em_ipt.o
diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c
new file mode 100644
index 000000000000..a5f34e930eff
--- /dev/null
+++ b/net/sched/em_ipt.c
@@ -0,0 +1,257 @@
+/*
+ * net/sched/em_ipt.c IPtables matches Ematch
+ *
+ * (c) 2018 Eyal Birger <eyal.birger@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/tc_ematch/tc_em_ipt.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/pkt_cls.h>
+
+struct em_ipt_match {
+	const struct xt_match *match;
+	u32 hook;
+	u8 match_data[0] __aligned(8);
+};
+
+struct em_ipt_xt_match {
+	char *match_name;
+	int (*validate_match_data)(struct nlattr **tb, u8 mrev);
+};
+
+static const struct nla_policy em_ipt_policy[TCA_EM_IPT_MAX + 1] = {
+	[TCA_EM_IPT_MATCH_NAME]		= { .type = NLA_STRING,
+					    .len = XT_EXTENSION_MAXNAMELEN },
+	[TCA_EM_IPT_MATCH_REVISION]	= { .type = NLA_U8 },
+	[TCA_EM_IPT_HOOK]		= { .type = NLA_U32 },
+	[TCA_EM_IPT_NFPROTO]		= { .type = NLA_U8 },
+	[TCA_EM_IPT_MATCH_DATA]		= { .type = NLA_UNSPEC },
+};
+
+static int check_match(struct net *net, struct em_ipt_match *im, int mdata_len)
+{
+	struct xt_mtchk_param mtpar = {};
+	union {
+		struct ipt_entry e4;
+		struct ip6t_entry e6;
+	} e = {};
+
+	mtpar.net	= net;
+	mtpar.table	= "filter";
+	mtpar.hook_mask	= 1 << im->hook;
+	mtpar.family	= im->match->family;
+	mtpar.match	= im->match;
+	mtpar.entryinfo = &e;
+	mtpar.matchinfo	= (void *)im->match_data;
+	return xt_check_match(&mtpar, mdata_len, 0, 0);
+}
+
+static int policy_validate_match_data(struct nlattr **tb, u8 mrev)
+{
+	if (mrev != 0) {
+		pr_err("only policy match revision 0 supported");
+		return -EINVAL;
+	}
+
+	if (nla_get_u32(tb[TCA_EM_IPT_HOOK]) != NF_INET_PRE_ROUTING) {
+		pr_err("policy can only be matched on NF_INET_PRE_ROUTING");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static const struct em_ipt_xt_match em_ipt_xt_matches[] = {
+	{
+		.match_name = "policy",
+		.validate_match_data = policy_validate_match_data
+	},
+	{}
+};
+
+static struct xt_match *get_xt_match(struct nlattr **tb)
+{
+	const struct em_ipt_xt_match *m;
+	struct nlattr *mname_attr;
+	u8 nfproto, mrev = 0;
+	int ret;
+
+	mname_attr = tb[TCA_EM_IPT_MATCH_NAME];
+	for (m = em_ipt_xt_matches; m->match_name; m++) {
+		if (!nla_strcmp(mname_attr, m->match_name))
+			break;
+	}
+
+	if (!m->match_name) {
+		pr_err("Unsupported xt match");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (tb[TCA_EM_IPT_MATCH_REVISION])
+		mrev = nla_get_u8(tb[TCA_EM_IPT_MATCH_REVISION]);
+
+	ret = m->validate_match_data(tb, mrev);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	nfproto = nla_get_u8(tb[TCA_EM_IPT_NFPROTO]);
+	return xt_request_find_match(nfproto, m->match_name, mrev);
+}
+
+static int em_ipt_change(struct net *net, void *data, int data_len,
+			 struct tcf_ematch *em)
+{
+	struct nlattr *tb[TCA_EM_IPT_MAX + 1];
+	struct em_ipt_match *im = NULL;
+	struct xt_match *match;
+	int mdata_len, ret;
+
+	ret = nla_parse(tb, TCA_EM_IPT_MAX, data, data_len, em_ipt_policy,
+			NULL);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[TCA_EM_IPT_HOOK] || !tb[TCA_EM_IPT_MATCH_NAME] ||
+	    !tb[TCA_EM_IPT_MATCH_DATA] || !tb[TCA_EM_IPT_NFPROTO])
+		return -EINVAL;
+
+	match = get_xt_match(tb);
+	if (IS_ERR(match)) {
+		pr_err("unable to load match\n");
+		return PTR_ERR(match);
+	}
+
+	mdata_len = XT_ALIGN(nla_len(tb[TCA_EM_IPT_MATCH_DATA]));
+	im = kzalloc(sizeof(*im) + mdata_len, GFP_KERNEL);
+	if (!im) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	im->match = match;
+	im->hook = nla_get_u32(tb[TCA_EM_IPT_HOOK]);
+	nla_memcpy(im->match_data, tb[TCA_EM_IPT_MATCH_DATA], mdata_len);
+
+	ret = check_match(net, im, mdata_len);
+	if (ret)
+		goto err;
+
+	em->datalen = sizeof(*im) + mdata_len;
+	em->data = (unsigned long)im;
+	return 0;
+
+err:
+	kfree(im);
+	module_put(match->me);
+	return ret;
+}
+
+static void em_ipt_destroy(struct tcf_ematch *em)
+{
+	struct em_ipt_match *im = (void *)em->data;
+
+	if (!im)
+		return;
+
+	if (im->match->destroy) {
+		struct xt_mtdtor_param par = {
+			.net = em->net,
+			.match = im->match,
+			.matchinfo = im->match_data,
+			.family = im->match->family
+		};
+		im->match->destroy(&par);
+	}
+	module_put(im->match->me);
+	kfree((void *)im);
+}
+
+static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em,
+			struct tcf_pkt_info *info)
+{
+	const struct em_ipt_match *im = (const void *)em->data;
+	struct xt_action_param acpar = {};
+	struct net_device *indev = NULL;
+	struct nf_hook_state state;
+	int ret;
+
+	rcu_read_lock();
+
+	if (skb->skb_iif)
+		indev = dev_get_by_index_rcu(em->net, skb->skb_iif);
+
+	nf_hook_state_init(&state, im->hook, im->match->family,
+			   indev ?: skb->dev, skb->dev, NULL, em->net, NULL);
+
+	acpar.match = im->match;
+	acpar.matchinfo = im->match_data;
+	acpar.state = &state;
+
+	ret = im->match->match(skb, &acpar);
+
+	rcu_read_unlock();
+	return ret;
+}
+
+static int em_ipt_dump(struct sk_buff *skb, struct tcf_ematch *em)
+{
+	struct em_ipt_match *im = (void *)em->data;
+
+	if (nla_put_string(skb, TCA_EM_IPT_MATCH_NAME, im->match->name) < 0)
+		return -EMSGSIZE;
+	if (nla_put_u32(skb, TCA_EM_IPT_HOOK, im->hook) < 0)
+		return -EMSGSIZE;
+	if (nla_put_u8(skb, TCA_EM_IPT_MATCH_REVISION, im->match->revision) < 0)
+		return -EMSGSIZE;
+	if (nla_put_u8(skb, TCA_EM_IPT_NFPROTO, im->match->family) < 0)
+		return -EMSGSIZE;
+	if (nla_put(skb, TCA_EM_IPT_MATCH_DATA,
+		    im->match->usersize ?: im->match->matchsize,
+		    im->match_data) < 0)
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+static struct tcf_ematch_ops em_ipt_ops = {
+	.kind	  = TCF_EM_IPT,
+	.change	  = em_ipt_change,
+	.destroy  = em_ipt_destroy,
+	.match	  = em_ipt_match,
+	.dump	  = em_ipt_dump,
+	.owner	  = THIS_MODULE,
+	.link	  = LIST_HEAD_INIT(em_ipt_ops.link)
+};
+
+static int __init init_em_ipt(void)
+{
+	return tcf_em_register(&em_ipt_ops);
+}
+
+static void __exit exit_em_ipt(void)
+{
+	tcf_em_unregister(&em_ipt_ops);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eyal Birger <eyal.birger@gmail.com>");
+MODULE_DESCRIPTION("TC extended match for IPtables matches");
+
+module_init(init_em_ipt);
+module_exit(exit_em_ipt);
+
+MODULE_ALIAS_TCF_EMATCH(TCF_EM_IPT);
-- 
cgit v1.2.3


From f905311356ecb9c88aceeb4fa63ee0a244fc2d72 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Mon, 19 Feb 2018 12:10:20 -0600
Subject: rds: send: mark expected switch fall-through in rds_rm_size

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Addresses-Coverity-ID: 1465362 ("Missing break in switch")
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Acked-by:  Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/send.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index 028ab598ac1b..79d158b3def0 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -902,6 +902,8 @@ static int rds_rm_size(struct msghdr *msg, int num_sgs)
 
 		case RDS_CMSG_ZCOPY_COOKIE:
 			zcopy_cookie = true;
+			/* fall through */
+
 		case RDS_CMSG_RDMA_DEST:
 		case RDS_CMSG_RDMA_MAP:
 			cmsg_groups |= 2;
-- 
cgit v1.2.3


From 0a6b2a1dc2a2105f178255fe495eb914b09cb37a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 19 Feb 2018 11:56:47 -0800
Subject: tcp: switch to GSO being always on

Oleksandr Natalenko reported performance issues with BBR without FQ
packet scheduler that were root caused to lack of SG and GSO/TSO on
his configuration.

In this mode, TCP internal pacing has to setup a high resolution timer
for each MSS sent.

We could implement in TCP a strategy similar to the one adopted
in commit fefa569a9d4b ("net_sched: sch_fq: account for schedule/timers drifts")
or decide to finally switch TCP stack to a GSO only mode.

This has many benefits :

1) Most TCP developments are done with TSO in mind.
2) Less high-resolution timers needs to be armed for TCP-pacing
3) GSO can benefit of xmit_more hint
4) Receiver GRO is more effective (as if TSO was used for real on sender)
   -> Lower ACK traffic
5) Write queues have less overhead (one skb holds about 64KB of payload)
6) SACK coalescing just works.
7) rtx rb-tree contains less packets, SACK is cheaper.

This patch implements the minimum patch, but we can remove some legacy
code as follow ups.

Tested:

On 40Gbit link, one netperf -t TCP_STREAM

BBR+fq:
sg on:  26 Gbits/sec
sg off: 15.7 Gbits/sec   (was 2.3 Gbit before patch)

BBR+pfifo_fast:
sg on:  24.2 Gbits/sec
sg off: 14.9 Gbits/sec  (was 0.66 Gbit before patch !!! )

BBR+fq_codel:
sg on:  24.4 Gbits/sec
sg off: 15 Gbits/sec  (was 0.66 Gbit before patch !!! )

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 2 +-
 net/ipv4/tcp.c  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index a1fa4a548f1b..507d8c6c4319 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1777,7 +1777,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 	u32 max_segs = 1;
 
 	sk_dst_set(sk, dst);
-	sk->sk_route_caps = dst->dev->features;
+	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
 	if (sk->sk_route_caps & NETIF_F_GSO)
 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 48636aee23c3..4b46a2ae46e3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -453,6 +453,7 @@ void tcp_init_sock(struct sock *sk)
 	sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
 
 	sk_sockets_allocated_inc(sk);
+	sk->sk_route_forced_caps = NETIF_F_GSO;
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
-- 
cgit v1.2.3


From 74d4a8f8d378ddbe7caf3331804c3d5a276a9b1a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 19 Feb 2018 11:56:48 -0800
Subject: tcp: remove sk_can_gso() use

After previous commit, sk_can_gso() is always true.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c       | 34 ++++++++--------------------------
 net/ipv4/tcp_input.c |  3 ---
 2 files changed, 8 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4b46a2ae46e3..6f35c12af85a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -898,7 +898,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 new_size_goal, size_goal;
 
-	if (!large_allowed || !sk_can_gso(sk))
+	if (!large_allowed)
 		return mss_now;
 
 	/* Note : tcp_tso_autosize() will eventually split this later */
@@ -1103,27 +1103,11 @@ static int linear_payload_sz(bool first_skb)
 	return 0;
 }
 
-static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc)
+static int select_size(bool first_skb, bool zc)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
-	int tmp = tp->mss_cache;
-
-	if (sg) {
-		if (zc)
-			return 0;
-
-		if (sk_can_gso(sk)) {
-			tmp = linear_payload_sz(first_skb);
-		} else {
-			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
-
-			if (tmp >= pgbreak &&
-			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
-				tmp = pgbreak;
-		}
-	}
-
-	return tmp;
+	if (zc)
+		return 0;
+	return linear_payload_sz(first_skb);
 }
 
 void tcp_free_fastopen_req(struct tcp_sock *tp)
@@ -1188,7 +1172,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 	int flags, err, copied = 0;
 	int mss_now = 0, size_goal, copied_syn = 0;
 	bool process_backlog = false;
-	bool sg, zc = false;
+	bool zc = false;
 	long timeo;
 
 	flags = msg->msg_flags;
@@ -1269,8 +1253,6 @@ restart:
 	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 		goto do_error;
 
-	sg = !!(sk->sk_route_caps & NETIF_F_SG);
-
 	while (msg_data_left(msg)) {
 		int copy = 0;
 		int max = size_goal;
@@ -1298,7 +1280,7 @@ new_segment:
 				goto restart;
 			}
 			first_skb = tcp_rtx_and_write_queues_empty(sk);
-			linear = select_size(sk, sg, first_skb, zc);
+			linear = select_size(first_skb, zc);
 			skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
 						  first_skb);
 			if (!skb)
@@ -1344,7 +1326,7 @@ new_segment:
 
 			if (!skb_can_coalesce(skb, i, pfrag->page,
 					      pfrag->offset)) {
-				if (i >= sysctl_max_skb_frags || !sg) {
+				if (i >= sysctl_max_skb_frags) {
 					tcp_mark_push(tp, skb);
 					goto new_segment;
 				}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a6b48f6253e3..06b9c4765f42 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1358,9 +1358,6 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
 	int len;
 	int in_sack;
 
-	if (!sk_can_gso(sk))
-		goto fallback;
-
 	/* Normally R but no L won't result in plain S */
 	if (!dup_sack &&
 	    (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
-- 
cgit v1.2.3


From dead7cdb0daec58490891e59f4fae0c5c76fa5f3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 19 Feb 2018 11:56:49 -0800
Subject: tcp: remove sk_check_csum_caps()

Since TCP relies on GSO, we do not need this helper anymore.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6f35c12af85a..7c4140271887 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1063,8 +1063,7 @@ EXPORT_SYMBOL_GPL(do_tcp_sendpages);
 int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
 			size_t size, int flags)
 {
-	if (!(sk->sk_route_caps & NETIF_F_SG) ||
-	    !sk_check_csum_caps(sk))
+	if (!(sk->sk_route_caps & NETIF_F_SG))
 		return sock_no_sendpage_locked(sk, page, offset, size, flags);
 
 	tcp_rate_check_app_limited(sk);  /* is sending application-limited? */
@@ -1190,7 +1189,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			goto out_err;
 		}
 
-		zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG;
+		zc = sk->sk_route_caps & NETIF_F_SG;
 		if (!zc)
 			uarg->zerocopy = 0;
 	}
@@ -1287,11 +1286,7 @@ new_segment:
 				goto wait_for_memory;
 
 			process_backlog = true;
-			/*
-			 * Check whether we can use HW checksum.
-			 */
-			if (sk_check_csum_caps(sk))
-				skb->ip_summed = CHECKSUM_PARTIAL;
+			skb->ip_summed = CHECKSUM_PARTIAL;
 
 			skb_entail(sk, skb);
 			copy = size_goal;
-- 
cgit v1.2.3


From 65ec60973af97dab72094490b4d2e9b8e169456c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 19 Feb 2018 11:56:50 -0800
Subject: tcp: tcp_sendmsg() only deals with CHECKSUM_PARTIAL

We no longer have skbs with skb->ip_summed == CHECKSUM_NONE
in TCP write queues.

We can remove dead code in tcp_sendmsg().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 7c4140271887..a33539798bf6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1254,14 +1254,10 @@ restart:
 
 	while (msg_data_left(msg)) {
 		int copy = 0;
-		int max = size_goal;
 
 		skb = tcp_write_queue_tail(sk);
-		if (skb) {
-			if (skb->ip_summed == CHECKSUM_NONE)
-				max = mss_now;
-			copy = max - skb->len;
-		}
+		if (skb)
+			copy = size_goal - skb->len;
 
 		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
 			bool first_skb;
@@ -1290,7 +1286,6 @@ new_segment:
 
 			skb_entail(sk, skb);
 			copy = size_goal;
-			max = size_goal;
 
 			/* All packets are restored as if they have
 			 * already been sent. skb_mstamp isn't set to
@@ -1374,7 +1369,7 @@ new_segment:
 			goto out;
 		}
 
-		if (skb->len < max || (flags & MSG_OOB) || unlikely(tp->repair))
+		if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair))
 			continue;
 
 		if (forced_push(tp)) {
-- 
cgit v1.2.3


From 4a64fd6ccf127973d1e2b2fc2f8024e550130617 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 19 Feb 2018 11:56:51 -0800
Subject: tcp: remove dead code from tcp_set_skb_tso_segs()

We no longer have skbs with skb->ip_summed == CHECKSUM_NONE
in TCP write queues.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b2bca373f8be..0196923aec42 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1206,7 +1206,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 /* Initialize TSO segments for a packet. */
 static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
 {
-	if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
+	if (skb->len <= mss_now) {
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
 		 */
-- 
cgit v1.2.3


From 98be9b12096fb46773b4a509d3822fd17c82218e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 19 Feb 2018 11:56:52 -0800
Subject: tcp: remove dead code after CHECKSUM_PARTIAL adoption

Since all skbs in write/rtx queues have CHECKSUM_PARTIAL,
we can remove dead code.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c   | 13 +++----------
 net/ipv4/tcp_output.c | 38 +++++---------------------------------
 2 files changed, 8 insertions(+), 43 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f3e52bc98980..2c6aec2643e8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -561,16 +561,9 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 {
 	struct tcphdr *th = tcp_hdr(skb);
 
-	if (skb->ip_summed == CHECKSUM_PARTIAL) {
-		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
-		skb->csum_start = skb_transport_header(skb) - skb->head;
-		skb->csum_offset = offsetof(struct tcphdr, check);
-	} else {
-		th->check = tcp_v4_check(skb->len, saddr, daddr,
-					 csum_partial(th,
-						      th->doff << 2,
-						      skb->csum));
-	}
+	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
+	skb->csum_start = skb_transport_header(skb) - skb->head;
+	skb->csum_offset = offsetof(struct tcphdr, check);
 }
 
 /* This routine computes an IPv4 TCP checksum. */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0196923aec42..8795d76f987c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1335,21 +1335,9 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
 	tcp_skb_fragment_eor(skb, buff);
 
-	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
-		/* Copy and checksum data tail into the new buffer. */
-		buff->csum = csum_partial_copy_nocheck(skb->data + len,
-						       skb_put(buff, nsize),
-						       nsize, 0);
-
-		skb_trim(skb, len);
-
-		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
-	} else {
-		skb->ip_summed = CHECKSUM_PARTIAL;
-		skb_split(skb, buff, len);
-	}
+	skb_split(skb, buff, len);
 
-	buff->ip_summed = skb->ip_summed;
+	buff->ip_summed = CHECKSUM_PARTIAL;
 
 	buff->tstamp = skb->tstamp;
 	tcp_fragment_tstamp(skb, buff);
@@ -1901,7 +1889,7 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 
 	tcp_skb_fragment_eor(skb, buff);
 
-	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
+	buff->ip_summed = CHECKSUM_PARTIAL;
 	skb_split(skb, buff, len);
 	tcp_fragment_tstamp(skb, buff);
 
@@ -2134,7 +2122,7 @@ static int tcp_mtu_probe(struct sock *sk)
 	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
 	TCP_SKB_CB(nskb)->sacked = 0;
 	nskb->csum = 0;
-	nskb->ip_summed = skb->ip_summed;
+	nskb->ip_summed = CHECKSUM_PARTIAL;
 
 	tcp_insert_write_queue_before(nskb, skb, sk);
 	tcp_highest_sack_replace(sk, skb, nskb);
@@ -2142,14 +2130,7 @@ static int tcp_mtu_probe(struct sock *sk)
 	len = 0;
 	tcp_for_write_queue_from_safe(skb, next, sk) {
 		copy = min_t(int, skb->len, probe_size - len);
-		if (nskb->ip_summed) {
-			skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
-		} else {
-			__wsum csum = skb_copy_and_csum_bits(skb, 0,
-							     skb_put(nskb, copy),
-							     copy, 0);
-			nskb->csum = csum_block_add(nskb->csum, csum, len);
-		}
+		skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
 
 		if (skb->len <= copy) {
 			/* We've eaten all the data from this skb.
@@ -2166,9 +2147,6 @@ static int tcp_mtu_probe(struct sock *sk)
 						   ~(TCPHDR_FIN|TCPHDR_PSH);
 			if (!skb_shinfo(skb)->nr_frags) {
 				skb_pull(skb, copy);
-				if (skb->ip_summed != CHECKSUM_PARTIAL)
-					skb->csum = csum_partial(skb->data,
-								 skb->len, 0);
 			} else {
 				__pskb_trim_head(skb, copy);
 				tcp_set_skb_tso_segs(skb, mss_now);
@@ -2746,12 +2724,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 	}
 	tcp_highest_sack_replace(sk, next_skb, skb);
 
-	if (next_skb->ip_summed == CHECKSUM_PARTIAL)
-		skb->ip_summed = CHECKSUM_PARTIAL;
-
-	if (skb->ip_summed != CHECKSUM_PARTIAL)
-		skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
-
 	/* Update sequence range on original skb. */
 	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
 
-- 
cgit v1.2.3


From cac56209a66ea3b0be67aa2966b2c628b944da1e Mon Sep 17 00:00:00 2001
From: Donald Sharp <sharpd@cumulusnetworks.com>
Date: Tue, 20 Feb 2018 08:55:58 -0500
Subject: net: Allow a rule to track originating protocol

Allow a rule that is being added/deleted/modified or
dumped to contain the originating protocol's id.

The protocol is handled just like a routes originating
protocol is.  This is especially useful because there
is starting to be a plethora of different user space
programs adding rules.

Allow the vrf device to specify that the kernel is the originator
of the rule created for this device.

Signed-off-by: Donald Sharp <sharpd@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index cb071b8e8d17..88298f18cbae 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -51,6 +51,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
 	r->pref = pref;
 	r->table = table;
 	r->flags = flags;
+	r->proto = RTPROT_KERNEL;
 	r->fr_net = ops->fro_net;
 	r->uid_range = fib_kuid_range_unset;
 
@@ -465,6 +466,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	}
 	refcount_set(&rule->refcnt, 1);
 	rule->fr_net = net;
+	rule->proto = frh->proto;
 
 	rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
 	                              : fib_default_rule_pref(ops);
@@ -664,6 +666,9 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	}
 
 	list_for_each_entry(rule, &ops->rules_list, list) {
+		if (frh->proto && (frh->proto != rule->proto))
+			continue;
+
 		if (frh->action && (frh->action != rule->action))
 			continue;
 
@@ -808,9 +813,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
 		goto nla_put_failure;
 	frh->res1 = 0;
-	frh->res2 = 0;
 	frh->action = rule->action;
 	frh->flags = rule->flags;
+	frh->proto = rule->proto;
 
 	if (rule->action == FR_ACT_GOTO &&
 	    rcu_access_pointer(rule->ctarget) == NULL)
-- 
cgit v1.2.3


From 7299d6f7bfd1921c0cfb5e202155f1a5cfdb57d0 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 19 Feb 2018 14:48:39 +0200
Subject: mac80211: support reporting A-MPDU EOF bit value/known

Support getting the EOF bit value reported from hardware
and writing it out to radiotap.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index e755f93ad735..478a9c735edb 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -439,6 +439,10 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local,
 			flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_ERR;
 		if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN)
 			flags |= IEEE80211_RADIOTAP_AMPDU_DELIM_CRC_KNOWN;
+		if (status->flag & RX_FLAG_AMPDU_EOF_BIT_KNOWN)
+			flags |= IEEE80211_RADIOTAP_AMPDU_EOF_KNOWN;
+		if (status->flag & RX_FLAG_AMPDU_EOF_BIT)
+			flags |= IEEE80211_RADIOTAP_AMPDU_EOF;
 		put_unaligned_le16(flags, pos);
 		pos += 2;
 		if (status->flag & RX_FLAG_AMPDU_DELIM_CRC_KNOWN)
-- 
cgit v1.2.3


From a1f2ba04cc92414b6b933289365eab878b0b2bf4 Mon Sep 17 00:00:00 2001
From: Sara Sharon <sara.sharon@intel.com>
Date: Mon, 19 Feb 2018 14:48:40 +0200
Subject: mac80211: add get TID helper

Extracting the TID from the QOS header is common enough
to justify helper.

Signed-off-by: Sara Sharon <sara.sharon@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/iface.c               | 3 +--
 net/mac80211/michael.c             | 2 +-
 net/mac80211/mlme.c                | 2 +-
 net/mac80211/rc80211_minstrel_ht.c | 2 +-
 net/mac80211/rx.c                  | 6 ++----
 net/mac80211/tx.c                  | 9 ++-------
 net/mac80211/wpa.c                 | 8 +++-----
 7 files changed, 11 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 5fe01f82df12..d13ba064951f 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -1324,8 +1324,7 @@ static void ieee80211_iface_work(struct work_struct *work)
 			mutex_lock(&local->sta_mtx);
 			sta = sta_info_get_bss(sdata, mgmt->sa);
 			if (sta) {
-				u16 tid = *ieee80211_get_qos_ctl(hdr) &
-						IEEE80211_QOS_CTL_TID_MASK;
+				u16 tid = ieee80211_get_tid(hdr);
 
 				__ieee80211_stop_rx_ba_session(
 					sta, tid, WLAN_BACK_RECIPIENT,
diff --git a/net/mac80211/michael.c b/net/mac80211/michael.c
index 408649bd4702..37e172701a63 100644
--- a/net/mac80211/michael.c
+++ b/net/mac80211/michael.c
@@ -35,7 +35,7 @@ static void michael_mic_hdr(struct michael_mic_ctx *mctx, const u8 *key,
 	da = ieee80211_get_DA(hdr);
 	sa = ieee80211_get_SA(hdr);
 	if (ieee80211_is_data_qos(hdr->frame_control))
-		tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+		tid = ieee80211_get_tid(hdr);
 	else
 		tid = 0;
 
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 39b660b9a908..010b127a3937 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2151,7 +2151,7 @@ static void ieee80211_sta_tx_wmm_ac_notify(struct ieee80211_sub_if_data *sdata,
 					   u16 tx_time)
 {
 	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
-	u16 tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+	u16 tid = ieee80211_get_tid(hdr);
 	int ac = ieee80211_ac_from_tid(tid);
 	struct ieee80211_sta_tx_tspec *tx_tspec = &ifmgd->tx_tspec[ac];
 	unsigned long now = jiffies;
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 4a5bdad9f303..fb586b6e5d49 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -669,7 +669,7 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb)
 	if (unlikely(skb->protocol == cpu_to_be16(ETH_P_PAE)))
 		return;
 
-	tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+	tid = ieee80211_get_tid(hdr);
 	if (likely(sta->ampdu_mlme.tid_tx[tid]))
 		return;
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 478a9c735edb..3dc162ddc3a6 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1189,7 +1189,7 @@ static void ieee80211_rx_reorder_ampdu(struct ieee80211_rx_data *rx,
 
 	ack_policy = *ieee80211_get_qos_ctl(hdr) &
 		     IEEE80211_QOS_CTL_ACK_POLICY_MASK;
-	tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+	tid = ieee80211_get_tid(hdr);
 
 	tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
 	if (!tid_agg_rx) {
@@ -1528,9 +1528,7 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx)
 		   ieee80211_has_pm(hdr->frame_control) &&
 		   (ieee80211_is_data_qos(hdr->frame_control) ||
 		    ieee80211_is_qos_nullfunc(hdr->frame_control))) {
-		u8 tid;
-
-		tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+		u8 tid = ieee80211_get_tid(hdr);
 
 		ieee80211_sta_uapsd_trigger(&rx->sta->sta, tid);
 	}
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 5decd0fb247c..7643178ef132 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -797,7 +797,6 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 {
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
-	u8 *qc;
 	int tid;
 
 	/*
@@ -844,9 +843,7 @@ ieee80211_tx_h_sequence(struct ieee80211_tx_data *tx)
 		return TX_CONTINUE;
 
 	/* include per-STA, per-TID sequence counter */
-
-	qc = ieee80211_get_qos_ctl(hdr);
-	tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
+	tid = ieee80211_get_tid(hdr);
 	tx->sta->tx_stats.msdu[tid]++;
 
 	hdr->seq_ctrl = ieee80211_tx_next_seq(tx->sta, tid);
@@ -1158,7 +1155,6 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_hdr *hdr;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 	int tid;
-	u8 *qc;
 
 	memset(tx, 0, sizeof(*tx));
 	tx->skb = skb;
@@ -1198,8 +1194,7 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,
 	    !ieee80211_hw_check(&local->hw, TX_AMPDU_SETUP_IN_HW)) {
 		struct tid_ampdu_tx *tid_tx;
 
-		qc = ieee80211_get_qos_ctl(hdr);
-		tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
+		tid = ieee80211_get_tid(hdr);
 
 		tid_tx = rcu_dereference(tx->sta->ampdu_mlme.tid_tx[tid]);
 		if (tid_tx) {
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 785056cb76f6..58d0b258b684 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -340,7 +340,7 @@ static void ccmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *b_0, u8 *aad)
 	a4_included = ieee80211_has_a4(hdr->frame_control);
 
 	if (ieee80211_is_data_qos(hdr->frame_control))
-		qos_tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK;
+		qos_tid = ieee80211_get_tid(hdr);
 	else
 		qos_tid = 0;
 
@@ -601,8 +601,7 @@ static void gcmp_special_blocks(struct sk_buff *skb, u8 *pn, u8 *j_0, u8 *aad)
 	aad[23] = 0;
 
 	if (ieee80211_is_data_qos(hdr->frame_control))
-		qos_tid = *ieee80211_get_qos_ctl(hdr) &
-			IEEE80211_QOS_CTL_TID_MASK;
+		qos_tid = ieee80211_get_tid(hdr);
 	else
 		qos_tid = 0;
 
@@ -867,8 +866,7 @@ ieee80211_crypto_cs_decrypt(struct ieee80211_rx_data *rx)
 		return RX_DROP_UNUSABLE;
 
 	if (ieee80211_is_data_qos(hdr->frame_control))
-		qos_tid = *ieee80211_get_qos_ctl(hdr) &
-				IEEE80211_QOS_CTL_TID_MASK;
+		qos_tid = ieee80211_get_tid(hdr);
 	else
 		qos_tid = 0;
 
-- 
cgit v1.2.3


From 94ba92713f8329c96e0a8e2880b3c1a785d1c95c Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Mon, 19 Feb 2018 14:48:41 +0200
Subject: mac80211: Call mgd_prep_tx before transmitting deauthentication

In multi channel scenarios, when disassociating from the AP before a
beacon was heard from the AP, it is not guaranteed that the virtual
interface is granted air time for the transmission of the
deauthentication frame. This in turn can lead to various issues as
the AP might never get the deauthentication frame.

To mitigate such possible issues, add a HW flag indicating that the
driver requires mac80211 to call the mgd_prep_tx() driver callback
to make sure that the virtual interface is granted immediate airtime
to be able to transmit the frame, in case that no beacon was heard
from the AP.

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/debugfs.c |  1 +
 net/mac80211/mlme.c    | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 1f466d12a6bc..a75653affbf7 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -212,6 +212,7 @@ static const char *hw_flag_names[] = {
 	FLAG(REPORTS_LOW_ACK),
 	FLAG(SUPPORTS_TX_FRAG),
 	FLAG(SUPPORTS_TDLS_BUFFER_STA),
+	FLAG(DEAUTH_NEED_MGD_TX_PREP),
 #undef FLAG
 };
 
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 010b127a3937..0024eff9bb84 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -7,6 +7,7 @@
  * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright (C) 2015 - 2017 Intel Deutschland GmbH
+ * Copyright (C) 2018        Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -2008,9 +2009,22 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 		ieee80211_flush_queues(local, sdata, true);
 
 	/* deauthenticate/disassociate now */
-	if (tx || frame_buf)
+	if (tx || frame_buf) {
+		struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+		/*
+		 * In multi channel scenarios guarantee that the virtual
+		 * interface is granted immediate airtime to transmit the
+		 * deauthentication frame by calling mgd_prepare_tx, if the
+		 * driver requested so.
+		 */
+		if (ieee80211_hw_check(&local->hw, DEAUTH_NEED_MGD_TX_PREP) &&
+		    !ifmgd->have_beacon)
+			drv_mgd_prepare_tx(sdata->local, sdata);
+
 		ieee80211_send_deauth_disassoc(sdata, ifmgd->bssid, stype,
 					       reason, tx, frame_buf);
+	}
 
 	/* flush out frame - make sure the deauth was actually sent */
 	if (tx)
-- 
cgit v1.2.3


From 79a5b9727a1cceacd49921b78425ebda91836bd6 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 22 Feb 2018 13:40:27 -0800
Subject: rds: rds_msg_zcopy should return error of null
 rm->data.op_mmp_znotifier

if either or both of MSG_ZEROCOPY and SOCK_ZEROCOPY have not been
specified, the rm->data.op_mmp_znotifier allocation will be skipped.
In this case, it is invalid ot pass down a cmsghdr with
RDS_CMSG_ZCOPY_COOKIE, so return EINVAL from rds_msg_zcopy for this
case.

Reported-by: syzbot+f893ae7bb2f6456dfbc3@syzkaller.appspotmail.com
Fixes: 0cebaccef3ac ("rds: zerocopy Tx support.")
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/send.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/send.c b/net/rds/send.c
index 79d158b3def0..acad04243b41 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -941,7 +941,8 @@ static int rds_cmsg_zcopy(struct rds_sock *rs, struct rds_message *rm,
 {
 	u32 *cookie;
 
-	if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)))
+	if (cmsg->cmsg_len < CMSG_LEN(sizeof(*cookie)) ||
+	    !rm->data.op_mmp_znotifier)
 		return -EINVAL;
 	cookie = CMSG_DATA(cmsg);
 	rm->data.op_mmp_znotifier->z_cookie = *cookie;
-- 
cgit v1.2.3


From 1b71af6053af1bd2f849e9fda4f71c1e3f145dcf Mon Sep 17 00:00:00 2001
From: Donald Sharp <sharpd@cumulusnetworks.com>
Date: Fri, 23 Feb 2018 14:01:52 -0500
Subject: net: fib_rules: Add new attribute to set protocol

For ages iproute2 has used `struct rtmsg` as the ancillary header for
FIB rules and in the process set the protocol value to RTPROT_BOOT.
Until ca56209a66 ("net: Allow a rule to track originating protocol")
the kernel rules code ignored the protocol value sent from userspace
and always returned 0 in notifications. To avoid incompatibility with
existing iproute2, send the protocol as a new attribute.

Fixes: cac56209a66 ("net: Allow a rule to track originating protocol")
Signed-off-by: Donald Sharp <sharpd@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 88298f18cbae..a6aea805a0a2 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -466,11 +466,13 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	}
 	refcount_set(&rule->refcnt, 1);
 	rule->fr_net = net;
-	rule->proto = frh->proto;
 
 	rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY])
 	                              : fib_default_rule_pref(ops);
 
+	rule->proto = tb[FRA_PROTOCOL] ?
+		nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
+
 	if (tb[FRA_IIFNAME]) {
 		struct net_device *dev;
 
@@ -666,7 +668,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	}
 
 	list_for_each_entry(rule, &ops->rules_list, list) {
-		if (frh->proto && (frh->proto != rule->proto))
+		if (tb[FRA_PROTOCOL] &&
+		    (rule->proto != nla_get_u8(tb[FRA_PROTOCOL])))
 			continue;
 
 		if (frh->action && (frh->action != rule->action))
@@ -786,7 +789,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 			 + nla_total_size(4) /* FRA_FWMARK */
 			 + nla_total_size(4) /* FRA_FWMASK */
 			 + nla_total_size_64bit(8) /* FRA_TUN_ID */
-			 + nla_total_size(sizeof(struct fib_kuid_range));
+			 + nla_total_size(sizeof(struct fib_kuid_range))
+			 + nla_total_size(1); /* FRA_PROTOCOL */
 
 	if (ops->nlmsg_payload)
 		payload += ops->nlmsg_payload(rule);
@@ -813,9 +817,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
 		goto nla_put_failure;
 	frh->res1 = 0;
+	frh->res2 = 0;
 	frh->action = rule->action;
 	frh->flags = rule->flags;
-	frh->proto = rule->proto;
+
+	if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto))
+		goto nla_put_failure;
 
 	if (rule->action == FR_ACT_GOTO &&
 	    rcu_access_pointer(rule->ctarget) == NULL)
-- 
cgit v1.2.3


From 7e81c55ee94338e4fe5bdd87e5fb15dbf040156f Mon Sep 17 00:00:00 2001
From: Simon Wunderlich <sw@simonwunderlich.de>
Date: Tue, 27 Feb 2018 17:56:10 +0100
Subject: batman-adv: Start new development cycle

Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/main.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index f7ba3f96d8f3..69bfedfad174 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -25,7 +25,7 @@
 #define BATADV_DRIVER_DEVICE "batman-adv"
 
 #ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2018.0"
+#define BATADV_SOURCE_VERSION "2018.1"
 #endif
 
 /* B.A.T.M.A.N. parameters */
-- 
cgit v1.2.3


From 6b1aea8cf2c8618146edaf6b35775ab55f7cafe5 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Mon, 1 Jan 2018 00:00:00 +0100
Subject: batman-adv: Update copyright years for 2018

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/Kconfig                 | 2 +-
 net/batman-adv/Makefile                | 2 +-
 net/batman-adv/bat_algo.c              | 2 +-
 net/batman-adv/bat_algo.h              | 2 +-
 net/batman-adv/bat_iv_ogm.c            | 2 +-
 net/batman-adv/bat_iv_ogm.h            | 2 +-
 net/batman-adv/bat_v.c                 | 2 +-
 net/batman-adv/bat_v.h                 | 2 +-
 net/batman-adv/bat_v_elp.c             | 2 +-
 net/batman-adv/bat_v_elp.h             | 2 +-
 net/batman-adv/bat_v_ogm.c             | 2 +-
 net/batman-adv/bat_v_ogm.h             | 2 +-
 net/batman-adv/bitarray.c              | 2 +-
 net/batman-adv/bitarray.h              | 2 +-
 net/batman-adv/bridge_loop_avoidance.c | 2 +-
 net/batman-adv/bridge_loop_avoidance.h | 2 +-
 net/batman-adv/debugfs.c               | 2 +-
 net/batman-adv/debugfs.h               | 2 +-
 net/batman-adv/distributed-arp-table.c | 2 +-
 net/batman-adv/distributed-arp-table.h | 2 +-
 net/batman-adv/fragmentation.c         | 2 +-
 net/batman-adv/fragmentation.h         | 2 +-
 net/batman-adv/gateway_client.c        | 2 +-
 net/batman-adv/gateway_client.h        | 2 +-
 net/batman-adv/gateway_common.c        | 2 +-
 net/batman-adv/gateway_common.h        | 2 +-
 net/batman-adv/hard-interface.c        | 2 +-
 net/batman-adv/hard-interface.h        | 2 +-
 net/batman-adv/hash.c                  | 2 +-
 net/batman-adv/hash.h                  | 2 +-
 net/batman-adv/icmp_socket.c           | 2 +-
 net/batman-adv/icmp_socket.h           | 2 +-
 net/batman-adv/log.c                   | 2 +-
 net/batman-adv/log.h                   | 2 +-
 net/batman-adv/main.c                  | 2 +-
 net/batman-adv/main.h                  | 2 +-
 net/batman-adv/multicast.c             | 2 +-
 net/batman-adv/multicast.h             | 2 +-
 net/batman-adv/netlink.c               | 2 +-
 net/batman-adv/netlink.h               | 2 +-
 net/batman-adv/network-coding.c        | 2 +-
 net/batman-adv/network-coding.h        | 2 +-
 net/batman-adv/originator.c            | 2 +-
 net/batman-adv/originator.h            | 2 +-
 net/batman-adv/routing.c               | 2 +-
 net/batman-adv/routing.h               | 2 +-
 net/batman-adv/send.c                  | 2 +-
 net/batman-adv/send.h                  | 2 +-
 net/batman-adv/soft-interface.c        | 2 +-
 net/batman-adv/soft-interface.h        | 2 +-
 net/batman-adv/sysfs.c                 | 2 +-
 net/batman-adv/sysfs.h                 | 2 +-
 net/batman-adv/tp_meter.c              | 2 +-
 net/batman-adv/tp_meter.h              | 2 +-
 net/batman-adv/translation-table.c     | 2 +-
 net/batman-adv/translation-table.h     | 2 +-
 net/batman-adv/tvlv.c                  | 2 +-
 net/batman-adv/tvlv.h                  | 2 +-
 net/batman-adv/types.h                 | 2 +-
 59 files changed, 59 insertions(+), 59 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index c44f6515be5e..e4e2e02b7380 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-# Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
 #
 # Marek Lindner, Simon Wunderlich
 #
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index 022f6e77307b..b97ba6fb8353 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-# Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
 #
 # Marek Lindner, Simon Wunderlich
 #
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index 80c72c7d3cad..ea309ad06175 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 029221615ba3..534b790c3753 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
  *
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 79e326383726..e21aa147607b 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index 9dc0dd5c83df..317cafd302cf 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 27e165ac9302..9c3a34b65b15 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2018  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
  *
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
index a17ab68bbce8..ec4a2a569750 100644
--- a/net/batman-adv/bat_v.h
+++ b/net/batman-adv/bat_v.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Linus Lüssing
  *
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index a83478c46597..28687493599f 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
  *
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index 5e39d0588a48..e8c7b7fd290d 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2018  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing, Marek Lindner
  *
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index ba59b77c605d..2948b41b06d4 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2018  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index 6a4c14ccc3c6..ed36c5e79fde 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2018  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index bdc1ef06e05b..a296a4d851f5 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2018  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index ca9d0753dd6b..48f683289531 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2018  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index fad47853ad3c..8ff81346ff0c 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
  *
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index b27571abcd2f..71f95a3e4d3f 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich
  *
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 21d1189957a7..4229b01ac7b5 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 90a08d35c501..37b069698b04 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 9703c791ffc5..19b15de455ab 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 12897eb46268..e24aa9601c52 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2018  B.A.T.M.A.N. contributors:
  *
  * Antonio Quartulli
  *
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 22dde42fd80e..d815acc13c35 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2018  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
  *
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index 138b22a1836a..944512e07782 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2013-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2018  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll <martin@hundeboll.net>
  *
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 37fe9a644f22..c294f6fd43e0 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index 981f58421a32..f0b86fcb2493 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index b3e156af2256..936c107f3199 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index afebd9c7edf4..80afb2793687 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 5f186bff284a..fd4a263dd6b7 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index de5e9a374ece..d1c0f6189301 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index 04d964358c98..7b49e4001778 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2018  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 4ce1b6d3ad5c..9490a7ca2ba6 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2006-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2018  B.A.T.M.A.N. contributors:
  *
  * Simon Wunderlich, Marek Lindner
  *
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index e91f29c7c638..7d5e9abb7a65 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index 84cddd01eeab..958be22beda9 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index dc9fa37ddd14..52d8a4b848c0 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index 35e02b2b9e72..35f4f397ed57 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index d31c8266e244..69c0d85bceb3 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 69bfedfad174..d5d65999207e 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index cbdeb47ec3f6..6eaffe50335a 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2014-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2018  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
  *
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 3ac06337ab71..6b8594e23da3 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2014-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2018  B.A.T.M.A.N. contributors:
  *
  * Linus Lüssing
  *
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index a823d3899bad..129af56b944d 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2018  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
  *
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index 0e7e57b69b54..571d9a5ae7aa 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2016-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2016-2018  B.A.T.M.A.N. contributors:
  *
  * Matthias Schiffer
  *
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index b48116bb24ef..c3578444f3cb 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2018  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
  *
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index adaeafa4f71e..65c346812bc1 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2018  B.A.T.M.A.N. contributors:
  *
  * Martin Hundebøll, Jeppe Ledet-Pedersen
  *
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 58a7d9274435..2a51a0cbb82a 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2009-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 8e543a3cdc6c..f3601ab0872e 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index b6891e8b741c..289df027ecdd 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index a1289bc5f115..db54c2d9b8bf 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 2a5ab6f1076d..4a35f5c2f52b 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 1e8c79093623..64cce07b8fe6 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 900c5ce21cd4..c95e2b2677fd 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index 075c5b5b2ce1..daf87f07fadd 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index c1578fa0b952..f2eef43bd2ec 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index bbeee61221fa..c1e3fb69952d 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2010-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner
  *
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 8b576712d0c1..11520de96ccb 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2018  B.A.T.M.A.N. contributors:
  *
  * Edo Monticelli, Antonio Quartulli
  *
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
index c8b8f2cb2c2b..68e600974759 100644
--- a/net/batman-adv/tp_meter.h
+++ b/net/batman-adv/tp_meter.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2012-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2018  B.A.T.M.A.N. contributors:
  *
  * Edo Monticelli, Antonio Quartulli
  *
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 7550a9ccd695..0225616d5771 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
  *
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 8d9e3abec2c8..01b6c8eafaf9 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich, Antonio Quartulli
  *
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index 5ffcb45ac6ff..a637458205d1 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
index a74df33f446d..ef5867f49824 100644
--- a/net/batman-adv/tvlv.h
+++ b/net/batman-adv/tvlv.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index bb1578410e0c..4a3b8837e1b5 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2017  B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2018  B.A.T.M.A.N. contributors:
  *
  * Marek Lindner, Simon Wunderlich
  *
-- 
cgit v1.2.3


From 08009a760213cf6125af9453a51203f4ae108ba1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 24 Feb 2018 21:20:33 +0300
Subject: net: make kmem caches as __ro_after_init

All kmem caches aren't reallocated once set up.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 2 +-
 net/core/skbuff.c        | 4 ++--
 net/ipv4/fib_trie.c      | 5 +++--
 net/ipv4/inetpeer.c      | 3 ++-
 net/ipv4/ipmr.c          | 3 ++-
 net/socket.c             | 2 +-
 6 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 27a55236ad64..690e78c6af45 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -362,7 +362,7 @@ static void dec_net_namespaces(struct ucounts *ucounts)
 	dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
 }
 
-static struct kmem_cache *net_cachep;
+static struct kmem_cache *net_cachep __ro_after_init;
 static struct workqueue_struct *netns_wq;
 
 static struct net *net_alloc(void)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1a7485a2cdfa..96d36b81a3a5 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -77,8 +77,8 @@
 #include <linux/capability.h>
 #include <linux/user_namespace.h>
 
-struct kmem_cache *skbuff_head_cache __read_mostly;
-static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+struct kmem_cache *skbuff_head_cache __ro_after_init;
+static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
 int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
 EXPORT_SYMBOL(sysctl_max_skb_frags);
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5530cd6fdbc7..62243a8abf92 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -50,6 +50,7 @@
 
 #define VERSION "0.409"
 
+#include <linux/cache.h>
 #include <linux/uaccess.h>
 #include <linux/bitops.h>
 #include <linux/types.h>
@@ -191,8 +192,8 @@ static size_t tnode_free_size;
  */
 static const int sync_pages = 128;
 
-static struct kmem_cache *fn_alias_kmem __read_mostly;
-static struct kmem_cache *trie_leaf_kmem __read_mostly;
+static struct kmem_cache *fn_alias_kmem __ro_after_init;
+static struct kmem_cache *trie_leaf_kmem __ro_after_init;
 
 static inline struct tnode *tn_info(struct key_vector *kv)
 {
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 914d56928578..1f04bd91fc2e 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -6,6 +6,7 @@
  *  Authors:	Andrey V. Savochkin <saw@msu.ru>
  */
 
+#include <linux/cache.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/slab.h>
@@ -51,7 +52,7 @@
  *		daddr: unchangeable
  */
 
-static struct kmem_cache *peer_cachep __read_mostly;
+static struct kmem_cache *peer_cachep __ro_after_init;
 
 void inet_peer_base_init(struct inet_peer_base *bp)
 {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 7c7ac9d32e77..591d1fc80a1f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -28,6 +28,7 @@
 
 #include <linux/uaccess.h>
 #include <linux/types.h>
+#include <linux/cache.h>
 #include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/timer.h>
@@ -96,7 +97,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
  * In this case data path is free of exclusive locks at all.
  */
 
-static struct kmem_cache *mrt_cachep __read_mostly;
+static struct kmem_cache *mrt_cachep __ro_after_init;
 
 static struct mr_table *ipmr_new_table(struct net *net, u32 id);
 static void ipmr_free_table(struct mr_table *mrt);
diff --git a/net/socket.c b/net/socket.c
index ab58e57c09ca..645d32b4872c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -233,7 +233,7 @@ static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
 	return __put_user(klen, ulen);
 }
 
-static struct kmem_cache *sock_inode_cachep __read_mostly;
+static struct kmem_cache *sock_inode_cachep __ro_after_init;
 
 static struct inode *sock_alloc_inode(struct super_block *sb)
 {
-- 
cgit v1.2.3


From f8c3d0dda4b09e05ad8781764cbe153815c1bf23 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 24 Feb 2018 21:21:38 +0300
Subject: xfrm: mark kmem_caches as __ro_after_init

Kmem caches aren't relocated once set up.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_input.c  | 3 ++-
 net/xfrm/xfrm_policy.c | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 1472c0857975..44fc54dc013c 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/bottom_half.h>
+#include <linux/cache.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/module.h>
@@ -31,7 +32,7 @@ struct xfrm_trans_cb {
 
 #define XFRM_TRANS_SKB_CB(__skb) ((struct xfrm_trans_cb *)&((__skb)->cb[0]))
 
-static struct kmem_cache *secpath_cachep __read_mostly;
+static struct kmem_cache *secpath_cachep __ro_after_init;
 
 static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
 static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[AF_INET6 + 1];
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7a23078132cf..12bd415d349e 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -51,7 +51,7 @@ static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
 static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
 						__read_mostly;
 
-static struct kmem_cache *xfrm_dst_cache __read_mostly;
+static struct kmem_cache *xfrm_dst_cache __ro_after_init;
 static __read_mostly seqcount_t xfrm_policy_hash_generation;
 
 static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr);
-- 
cgit v1.2.3


From 5211fcfb8110f77ff9f389e476563345817f61a5 Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@oracle.com>
Date: Mon, 26 Feb 2018 14:28:19 -0800
Subject: esp: check the NETIF_F_HW_ESP_TX_CSUM bit before segmenting

If I understand correctly, we should not be asking for a
checksum offload on an ipsec packet if the netdev isn't
advertising NETIF_F_HW_ESP_TX_CSUM.  In that case, we should
clear the NETIF_F_CSUM_MASK bits.

Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/esp4_offload.c | 2 ++
 net/ipv6/esp6_offload.c | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index da5635fc52c2..7cf755ef9efb 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -138,6 +138,8 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
 	if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
 	    (x->xso.dev != skb->dev))
 		esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
+	else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
+		esp_features = features & ~NETIF_F_CSUM_MASK;
 
 	xo->flags |= XFRM_GSO_SEGMENT;
 
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 3fd1ec775dc2..27f59b61f70f 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -165,6 +165,8 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
 	if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
 	    (x->xso.dev != skb->dev))
 		esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
+	else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
+		esp_features = features & ~NETIF_F_CSUM_MASK;
 
 	xo->flags |= XFRM_GSO_SEGMENT;
 
-- 
cgit v1.2.3


From 59cae5b9d6d39db2804ed5f0330eb5507d40c34c Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 23 Feb 2018 10:06:04 +0100
Subject: mac80211: support AP 4-addr mode fast-rx

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3dc162ddc3a6..89dcf9f762ce 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3783,6 +3783,15 @@ void ieee80211_check_fast_rx(struct sta_info *sta)
 			!(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) &&
 			(sdata->vif.type != NL80211_IFTYPE_AP_VLAN ||
 			 !sdata->u.vlan.sta);
+
+		if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
+		    sdata->u.vlan.sta) {
+			fastrx.expected_ds_bits |=
+				cpu_to_le16(IEEE80211_FCTL_FROMDS);
+			fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr4);
+			fastrx.internal_forward = 0;
+		}
+
 		break;
 	default:
 		goto clear;
-- 
cgit v1.2.3


From 1d870162418a826905161d2276c912986d3b9d9a Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 23 Feb 2018 10:06:05 +0100
Subject: mac80211: support fast-rx with incompatible PS capabilities when PS
 is disabled

When powersave is disabled for the interface, we can do fast-rx anyway.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
[fixed indentation on one line]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/cfg.c |  1 +
 net/mac80211/rx.c  | 17 +++++++++++------
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index f4195a0f0279..fd68f6fb02d7 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2685,6 +2685,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
 
 	ieee80211_recalc_ps(local);
 	ieee80211_recalc_ps_vif(sdata);
+	ieee80211_check_fast_rx_iface(sdata);
 
 	return 0;
 }
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 89dcf9f762ce..1d417960d376 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3750,12 +3750,7 @@ void ieee80211_check_fast_rx(struct sta_info *sta)
 		/* 4-addr is harder to deal with, later maybe */
 		if (sdata->u.mgd.use_4addr)
 			goto clear;
-		/* software powersave is a huge mess, avoid all of it */
-		if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK))
-			goto clear;
-		if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) &&
-		    !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
-			goto clear;
+
 		if (sta->sta.tdls) {
 			fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1);
 			fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2);
@@ -3767,6 +3762,16 @@ void ieee80211_check_fast_rx(struct sta_info *sta)
 			fastrx.expected_ds_bits =
 				cpu_to_le16(IEEE80211_FCTL_FROMDS);
 		}
+
+		if (!sdata->u.mgd.powersave)
+			break;
+
+		/* software powersave is a huge mess, avoid all of it */
+		if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK))
+			goto clear;
+		if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) &&
+		    !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
+			goto clear;
 		break;
 	case NL80211_IFTYPE_AP_VLAN:
 	case NL80211_IFTYPE_AP:
-- 
cgit v1.2.3


From 9251a4736ee360d3850644ddaaa1a61dcec96238 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 23 Feb 2018 13:55:50 +0100
Subject: mac80211: support station 4-addr mode fast-rx

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 1d417960d376..2783c5cd7de7 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3747,10 +3747,6 @@ void ieee80211_check_fast_rx(struct sta_info *sta)
 
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_STATION:
-		/* 4-addr is harder to deal with, later maybe */
-		if (sdata->u.mgd.use_4addr)
-			goto clear;
-
 		if (sta->sta.tdls) {
 			fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1);
 			fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2);
@@ -3763,6 +3759,13 @@ void ieee80211_check_fast_rx(struct sta_info *sta)
 				cpu_to_le16(IEEE80211_FCTL_FROMDS);
 		}
 
+		if (sdata->u.mgd.use_4addr && !sta->sta.tdls) {
+			fastrx.expected_ds_bits |=
+				cpu_to_le16(IEEE80211_FCTL_TODS);
+			fastrx.da_offs = offsetof(struct ieee80211_hdr, addr3);
+			fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr4);
+		}
+
 		if (!sdata->u.mgd.powersave)
 			break;
 
-- 
cgit v1.2.3


From 21b7022f13fb038b3e204a892c7cc42749754f7f Mon Sep 17 00:00:00 2001
From: Ilan Peer <ilan.peer@intel.com>
Date: Mon, 19 Feb 2018 14:48:44 +0200
Subject: mac80211: agg-rx: Accept ADDBA request update if timeout did not
 change

As there is no support for updating an existing ADDBA session with
a peer, we decline the request (while keeping the session active).
However, in case that the timeout did not change, there is no need
to decline the request, so modify the code to reply with status success
in such a case (this is useful for interoperability with APs that send an
ADDBA request update without changing the timeout value).

Signed-off-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/agg-rx.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 1f3188d03840..e83c19d4c292 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -298,13 +298,23 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
 
 	if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) {
 		if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) {
+			struct tid_ampdu_rx *tid_rx;
+
 			ht_dbg_ratelimited(sta->sdata,
 					   "updated AddBA Req from %pM on tid %u\n",
 					   sta->sta.addr, tid);
 			/* We have no API to update the timeout value in the
-			 * driver so reject the timeout update.
+			 * driver so reject the timeout update if the timeout
+			 * changed. If if did not change, i.e., no real update,
+			 * just reply with success.
 			 */
-			status = WLAN_STATUS_REQUEST_DECLINED;
+			rcu_read_lock();
+			tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
+			if (tid_rx && tid_rx->timeout == timeout)
+				status = WLAN_STATUS_SUCCESS;
+			else
+				status = WLAN_STATUS_REQUEST_DECLINED;
+			rcu_read_unlock();
 			goto end;
 		}
 
-- 
cgit v1.2.3


From 84d0a394808621619a4b319c6eed16e3b389e214 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Fri, 5 Jan 2018 09:54:32 +0100
Subject: batman-adv: Fix indentation of batadv_seq_before

Also multiline macros should have their statements start on a tabstop. This
was detected by checkpatch.pl after commit a134f8de9f40 ("checkpatch:
improve the TABSTOP test to include declarations").

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/main.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index d5d65999207e..057a28a9fe88 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -331,11 +331,13 @@ static inline bool batadv_has_timed_out(unsigned long timestamp,
  *
  * Return: true when x is a predecessor of y, false otherwise
  */
-#define batadv_seq_before(x, y) ({typeof(x)_d1 = (x); \
-				 typeof(y)_d2 = (y); \
-				 typeof(x)_dummy = (_d1 - _d2); \
-				 (void)(&_d1 == &_d2); \
-				 _dummy > batadv_smallest_signed_int(_dummy); })
+#define batadv_seq_before(x, y) ({ \
+	typeof(x)_d1 = (x); \
+	typeof(y)_d2 = (y); \
+	typeof(x)_dummy = (_d1 - _d2); \
+	(void)(&_d1 == &_d2); \
+	_dummy > batadv_smallest_signed_int(_dummy); \
+})
 
 /**
  * batadv_seq_after() - Checks if a sequence number x is a successor of y
-- 
cgit v1.2.3


From d7625f9f72dc148b4f25d9bc5014b710e1024b15 Mon Sep 17 00:00:00 2001
From: Sven Eckelmann <sven@narfation.org>
Date: Tue, 20 Feb 2018 12:08:10 +0100
Subject: batman-adv: Avoid relation operator comparison with bool

commit 785ea1144182 ("batman-adv: Distributed ARP Table - create DHT helper
functions") introduced a return check of batadv_compare_eth which uses a
boolean return value since commit 16af73458aca ("batman-adv: main,
batadv_compare_eth return bool"). A relational (<, >, <= or >=) operator
is not the right one for such a check.

Reported-by: David Binderman <dcb314@hotmail.com>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Acked-by: Antonio Quartulli <a@unstable.cc>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/distributed-arp-table.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 19b15de455ab..4469dcc1558f 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -495,7 +495,7 @@ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res,
 	 * the one with the lowest address
 	 */
 	if (tmp_max == max && max_orig_node &&
-	    batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0)
+	    batadv_compare_eth(candidate->orig, max_orig_node->orig))
 		goto out;
 
 	ret = true;
-- 
cgit v1.2.3


From 24bba078eca099b5bd25e17e97b485f013589f8c Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 27 Feb 2018 13:03:07 +0100
Subject: mac80211: support A-MSDU in fast-rx

Only works if the IV was stripped from packets. Create a smaller
variant of ieee80211_rx_h_amsdu, which bypasses checks already done
within the fast-rx context.

In order to do so, update cfg80211's ieee80211_data_to_8023_exthdr()
to take the offset between header and snap.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/rx.c   | 124 +++++++++++++++++++++++++++++++---------------------
 net/wireless/util.c |   5 ++-
 2 files changed, 76 insertions(+), 53 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 2783c5cd7de7..de7d10732fd5 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2353,39 +2353,17 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
 }
 
 static ieee80211_rx_result debug_noinline
-ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
+__ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset)
 {
 	struct net_device *dev = rx->sdata->dev;
 	struct sk_buff *skb = rx->skb;
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
 	__le16 fc = hdr->frame_control;
 	struct sk_buff_head frame_list;
-	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
 	struct ethhdr ethhdr;
 	const u8 *check_da = ethhdr.h_dest, *check_sa = ethhdr.h_source;
 
-	if (unlikely(!ieee80211_is_data(fc)))
-		return RX_CONTINUE;
-
-	if (unlikely(!ieee80211_is_data_present(fc)))
-		return RX_DROP_MONITOR;
-
-	if (!(status->rx_flags & IEEE80211_RX_AMSDU))
-		return RX_CONTINUE;
-
 	if (unlikely(ieee80211_has_a4(hdr->frame_control))) {
-		switch (rx->sdata->vif.type) {
-		case NL80211_IFTYPE_AP_VLAN:
-			if (!rx->sdata->u.vlan.sta)
-				return RX_DROP_UNUSABLE;
-			break;
-		case NL80211_IFTYPE_STATION:
-			if (!rx->sdata->u.mgd.use_4addr)
-				return RX_DROP_UNUSABLE;
-			break;
-		default:
-			return RX_DROP_UNUSABLE;
-		}
 		check_da = NULL;
 		check_sa = NULL;
 	} else switch (rx->sdata->vif.type) {
@@ -2405,15 +2383,13 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
 			break;
 	}
 
-	if (is_multicast_ether_addr(hdr->addr1))
-		return RX_DROP_UNUSABLE;
-
 	skb->dev = dev;
 	__skb_queue_head_init(&frame_list);
 
 	if (ieee80211_data_to_8023_exthdr(skb, &ethhdr,
 					  rx->sdata->vif.addr,
-					  rx->sdata->vif.type))
+					  rx->sdata->vif.type,
+					  data_offset))
 		return RX_DROP_UNUSABLE;
 
 	ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr,
@@ -2435,6 +2411,44 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
 	return RX_QUEUED;
 }
 
+static ieee80211_rx_result debug_noinline
+ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
+{
+	struct sk_buff *skb = rx->skb;
+	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
+	__le16 fc = hdr->frame_control;
+
+	if (!(status->rx_flags & IEEE80211_RX_AMSDU))
+		return RX_CONTINUE;
+
+	if (unlikely(!ieee80211_is_data(fc)))
+		return RX_CONTINUE;
+
+	if (unlikely(!ieee80211_is_data_present(fc)))
+		return RX_DROP_MONITOR;
+
+	if (unlikely(ieee80211_has_a4(hdr->frame_control))) {
+		switch (rx->sdata->vif.type) {
+		case NL80211_IFTYPE_AP_VLAN:
+			if (!rx->sdata->u.vlan.sta)
+				return RX_DROP_UNUSABLE;
+			break;
+		case NL80211_IFTYPE_STATION:
+			if (!rx->sdata->u.mgd.use_4addr)
+				return RX_DROP_UNUSABLE;
+			break;
+		default:
+			return RX_DROP_UNUSABLE;
+		}
+	}
+
+	if (is_multicast_ether_addr(hdr->addr1))
+		return RX_DROP_UNUSABLE;
+
+	return __ieee80211_rx_h_amsdu(rx, 0);
+}
+
 #ifdef CONFIG_MAC80211_MESH
 static ieee80211_rx_result
 ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
@@ -3898,7 +3912,8 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
 	struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
 	struct sta_info *sta = rx->sta;
 	int orig_len = skb->len;
-	int snap_offs = ieee80211_hdrlen(hdr->frame_control);
+	int hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	int snap_offs = hdrlen;
 	struct {
 		u8 snap[sizeof(rfc1042_header)];
 		__be16 proto;
@@ -3929,10 +3944,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
 	    (status->flag & FAST_RX_CRYPT_FLAGS) != FAST_RX_CRYPT_FLAGS)
 		return false;
 
-	/* we don't deal with A-MSDU deaggregation here */
-	if (status->rx_flags & IEEE80211_RX_AMSDU)
-		return false;
-
 	if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
 		return false;
 
@@ -3964,21 +3975,24 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
 		snap_offs += IEEE80211_CCMP_HDR_LEN;
 	}
 
-	if (!pskb_may_pull(skb, snap_offs + sizeof(*payload)))
-		goto drop;
-	payload = (void *)(skb->data + snap_offs);
+	if (!(status->rx_flags & IEEE80211_RX_AMSDU)) {
+		if (!pskb_may_pull(skb, snap_offs + sizeof(*payload)))
+			goto drop;
 
-	if (!ether_addr_equal(payload->snap, fast_rx->rfc1042_hdr))
-		return false;
+		payload = (void *)(skb->data + snap_offs);
 
-	/* Don't handle these here since they require special code.
-	 * Accept AARP and IPX even though they should come with a
-	 * bridge-tunnel header - but if we get them this way then
-	 * there's little point in discarding them.
-	 */
-	if (unlikely(payload->proto == cpu_to_be16(ETH_P_TDLS) ||
-		     payload->proto == fast_rx->control_port_protocol))
-		return false;
+		if (!ether_addr_equal(payload->snap, fast_rx->rfc1042_hdr))
+			return false;
+
+		/* Don't handle these here since they require special code.
+		 * Accept AARP and IPX even though they should come with a
+		 * bridge-tunnel header - but if we get them this way then
+		 * there's little point in discarding them.
+		 */
+		if (unlikely(payload->proto == cpu_to_be16(ETH_P_TDLS) ||
+			     payload->proto == fast_rx->control_port_protocol))
+			return false;
+	}
 
 	/* after this point, don't punt to the slowpath! */
 
@@ -3992,12 +4006,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
 	}
 
 	/* statistics part of ieee80211_rx_h_sta_process() */
-	stats->last_rx = jiffies;
-	stats->last_rate = sta_stats_encode_rate(status);
-
-	stats->fragments++;
-	stats->packets++;
-
 	if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
 		stats->last_signal = status->signal;
 		if (!fast_rx->uses_rss)
@@ -4026,6 +4034,20 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx,
 	if (rx->key && !ieee80211_has_protected(hdr->frame_control))
 		goto drop;
 
+	if (status->rx_flags & IEEE80211_RX_AMSDU) {
+		if (__ieee80211_rx_h_amsdu(rx, snap_offs - hdrlen) !=
+		    RX_QUEUED)
+			goto drop;
+
+		return true;
+	}
+
+	stats->last_rx = jiffies;
+	stats->last_rate = sta_stats_encode_rate(status);
+
+	stats->fragments++;
+	stats->packets++;
+
 	/* do the header conversion - first grab the addresses */
 	ether_addr_copy(addrs.da, skb->data + fast_rx->da_offs);
 	ether_addr_copy(addrs.sa, skb->data + fast_rx->sa_offs);
diff --git a/net/wireless/util.c b/net/wireless/util.c
index c69160694b6c..d112e9a89364 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -420,7 +420,8 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr)
 EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen);
 
 int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
-				  const u8 *addr, enum nl80211_iftype iftype)
+				  const u8 *addr, enum nl80211_iftype iftype,
+				  u8 data_offset)
 {
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
 	struct {
@@ -434,7 +435,7 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr,
 	if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
 		return -1;
 
-	hdrlen = ieee80211_hdrlen(hdr->frame_control);
+	hdrlen = ieee80211_hdrlen(hdr->frame_control) + data_offset;
 	if (skb->len < hdrlen + 8)
 		return -1;
 
-- 
cgit v1.2.3


From c80afa026a7f6eb01f5431760cd894526153d4a8 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 15:59:19 +0300
Subject: net: Convert /proc creating and destroying pernet_operations

These pernet_operations just create and destroy /proc entries,
and they can safely marked as async:

pppoe_net_ops
vlan_net_ops
canbcm_pernet_ops
kcm_net_ops
pfkey_net_ops
pppol2tp_net_ops
phonet_net_ops

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.c    | 1 +
 net/can/bcm.c       | 1 +
 net/kcm/kcmproc.c   | 1 +
 net/key/af_key.c    | 1 +
 net/l2tp/l2tp_ppp.c | 1 +
 net/phonet/pn_dev.c | 1 +
 6 files changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index bad01b14a4ad..bd0ed39f65fb 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -729,6 +729,7 @@ static struct pernet_operations vlan_net_ops = {
 	.exit = vlan_exit_net,
 	.id   = &vlan_net_id,
 	.size = sizeof(struct vlan_net),
+	.async = true,
 };
 
 static int __init vlan_proto_init(void)
diff --git a/net/can/bcm.c b/net/can/bcm.c
index ac5e5e34fee3..26730d39e048 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1717,6 +1717,7 @@ static void canbcm_pernet_exit(struct net *net)
 static struct pernet_operations canbcm_pernet_ops __read_mostly = {
 	.init = canbcm_pernet_init,
 	.exit = canbcm_pernet_exit,
+	.async = true,
 };
 
 static int __init bcm_module_init(void)
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 9d5649e4e8b7..2c1c8b3e4452 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -433,6 +433,7 @@ static void kcm_proc_exit_net(struct net *net)
 static struct pernet_operations kcm_net_ops = {
 	.init = kcm_proc_init_net,
 	.exit = kcm_proc_exit_net,
+	.async = true,
 };
 
 int __init kcm_proc_init(void)
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 7e2e7188e7f4..3ac08ab26207 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3863,6 +3863,7 @@ static struct pernet_operations pfkey_net_ops = {
 	.exit = pfkey_net_exit,
 	.id   = &pfkey_net_id,
 	.size = sizeof(struct netns_pfkey),
+	.async = true,
 };
 
 static void __exit ipsec_pfkey_exit(void)
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 99a03c72db4f..0c4f49a6a0cb 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1770,6 +1770,7 @@ static struct pernet_operations pppol2tp_net_ops = {
 	.init = pppol2tp_init_net,
 	.exit = pppol2tp_exit_net,
 	.id   = &pppol2tp_net_id,
+	.async = true,
 };
 
 /*****************************************************************************
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
index 77787512fc32..9454e8393793 100644
--- a/net/phonet/pn_dev.c
+++ b/net/phonet/pn_dev.c
@@ -342,6 +342,7 @@ static struct pernet_operations phonet_net_ops = {
 	.exit = phonet_exit_net,
 	.id   = &phonet_net_id,
 	.size = sizeof(struct phonet_net),
+	.async = true,
 };
 
 /* Initialize Phonet devices list */
-- 
cgit v1.2.3


From 47d63a01797be8de142beeb0090704501701eafa Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 15:59:28 +0300
Subject: net: Convert hashlimit_net_ops and recent_net_ops

These pernet_operations just create and destroy /proc entries.
Also, new /proc entries also may come after new nf rules
are added, but this is not possible, when net isn't alive.
So, they are safe to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/xt_hashlimit.c | 1 +
 net/netfilter/xt_recent.c    | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 66f5aca62a08..db2fe0911740 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -1345,6 +1345,7 @@ static struct pernet_operations hashlimit_net_ops = {
 	.exit	= hashlimit_net_exit,
 	.id	= &hashlimit_net_id,
 	.size	= sizeof(struct hashlimit_net),
+	.async	= true,
 };
 
 static int __init hashlimit_mt_init(void)
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 6d232d18faff..19efdb757944 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -687,6 +687,7 @@ static struct pernet_operations recent_net_ops = {
 	.exit	= recent_net_exit,
 	.id	= &recent_net_id,
 	.size	= sizeof(struct recent_net),
+	.async	= true,
 };
 
 static struct xt_match recent_mt_reg[] __read_mostly = {
-- 
cgit v1.2.3


From f0aad8e340eace8a13736277a0e8c040a879740c Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 15:59:37 +0300
Subject: net: Convert synproxy_net_ops

These pernet_operations create and destroy /proc entries
and allocate extents to template ct, which depend on global
nf_ct_ext_types[] array. So, we are able to mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_synproxy_core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 92139a087260..64b875e452ca 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -398,6 +398,7 @@ static struct pernet_operations synproxy_net_ops = {
 	.exit		= synproxy_net_exit,
 	.id		= &synproxy_net_id,
 	.size		= sizeof(struct synproxy_net),
+	.async		= true,
 };
 
 static int __init synproxy_core_init(void)
-- 
cgit v1.2.3


From 02df428ca291f20285f04c25a7efd664a5d83551 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 15:59:56 +0300
Subject: net: Convert simple pernet_operations

These pernet_operations make pretty simple actions
like variable initialization on init, debug checks
on exit, and so on, and they obviously are able
to be executed in parallel with any others:

vrf_net_ops
lockd_net_ops
grace_net_ops
xfrm6_tunnel_net_ops
kcm_net_ops
tcf_net_ops

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/xfrm6_tunnel.c | 1 +
 net/kcm/kcmsock.c       | 1 +
 net/sched/cls_api.c     | 1 +
 3 files changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index f85f0d7480ac..a9673619e0e9 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -353,6 +353,7 @@ static struct pernet_operations xfrm6_tunnel_net_ops = {
 	.exit	= xfrm6_tunnel_net_exit,
 	.id	= &xfrm6_tunnel_net_id,
 	.size	= sizeof(struct xfrm6_tunnel_net),
+	.async	= true,
 };
 
 static int __init xfrm6_tunnel_init(void)
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 435594648dac..a6cd0712e063 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -2015,6 +2015,7 @@ static struct pernet_operations kcm_net_ops = {
 	.exit = kcm_exit_net,
 	.id   = &kcm_net_id,
 	.size = sizeof(struct kcm_net),
+	.async = true,
 };
 
 static int __init kcm_init(void)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 9d1a8bbf8152..19f9f421d5b7 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1618,6 +1618,7 @@ static struct pernet_operations tcf_net_ops = {
 	.exit = tcf_net_exit,
 	.id   = &tcf_net_id,
 	.size = sizeof(struct tcf_net),
+	.async = true,
 };
 
 static int __init tc_filter_init(void)
-- 
cgit v1.2.3


From 5fcc85843d94e40ac1633d18d9651f69d9f16770 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:00:22 +0300
Subject: net: Convert sysctl creating and destroying pernet_operations

These pernet_operations create and destroy sysctl tables,
and they are able to be executed in parallel with any others:

ip_vs_lblc_ops
ip_vs_lblcr_ops

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/ipvs/ip_vs_lblc.c  | 1 +
 net/netfilter/ipvs/ip_vs_lblcr.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index d625179de485..6a340c94c4b8 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -604,6 +604,7 @@ static void __net_exit __ip_vs_lblc_exit(struct net *net) { }
 static struct pernet_operations ip_vs_lblc_ops = {
 	.init = __ip_vs_lblc_init,
 	.exit = __ip_vs_lblc_exit,
+	.async = true,
 };
 
 static int __init ip_vs_lblc_init(void)
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 84c57b62a588..0627881128da 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -789,6 +789,7 @@ static void __net_exit __ip_vs_lblcr_exit(struct net *net) { }
 static struct pernet_operations ip_vs_lblcr_ops = {
 	.init = __ip_vs_lblcr_init,
 	.exit = __ip_vs_lblcr_exit,
+	.async = true,
 };
 
 static int __init ip_vs_lblcr_init(void)
-- 
cgit v1.2.3


From 685ecfb19888963f61c6085c17c254dbf665e9da Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:00:31 +0300
Subject: net: Convert tc_action_net_init() and tc_action_net_exit() based
 pernet_operations

These pernet_operations are from net/sched directory, and they call only
tc_action_net_init() and tc_action_net_exit():

bpf_net_ops
connmark_net_ops
csum_net_ops
gact_net_ops
ife_net_ops
ipt_net_ops
xt_net_ops
mirred_net_ops
nat_net_ops
pedit_net_ops
police_net_ops
sample_net_ops
simp_net_ops
skbedit_net_ops
skbmod_net_ops
tunnel_key_net_ops
vlan_net_ops

1)tc_action_net_init() just allocates and initializes per-net memory.
2)There should not be in-flight packets at the time of tc_action_net_exit()
call, or another pernet_operations send packets to dying net (except
netlink). So, it seems they can be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_bpf.c        | 1 +
 net/sched/act_connmark.c   | 1 +
 net/sched/act_csum.c       | 1 +
 net/sched/act_gact.c       | 1 +
 net/sched/act_ife.c        | 1 +
 net/sched/act_ipt.c        | 2 ++
 net/sched/act_mirred.c     | 1 +
 net/sched/act_nat.c        | 1 +
 net/sched/act_pedit.c      | 1 +
 net/sched/act_police.c     | 1 +
 net/sched/act_sample.c     | 1 +
 net/sched/act_simple.c     | 1 +
 net/sched/act_skbedit.c    | 1 +
 net/sched/act_skbmod.c     | 1 +
 net/sched/act_tunnel_key.c | 1 +
 net/sched/act_vlan.c       | 1 +
 16 files changed, 17 insertions(+)

(limited to 'net')

diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index cb3c5d403c88..da72e0cf2b1f 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -413,6 +413,7 @@ static struct pernet_operations bpf_net_ops = {
 	.exit_batch = bpf_exit_net,
 	.id   = &bpf_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 static int __init bpf_init_module(void)
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index e4b880fa51fe..371e5e4ab3e2 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -222,6 +222,7 @@ static struct pernet_operations connmark_net_ops = {
 	.exit_batch = connmark_exit_net,
 	.id   = &connmark_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 static int __init connmark_init_module(void)
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index d5c2e528d150..1fb1f1f6a555 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -677,6 +677,7 @@ static struct pernet_operations csum_net_ops = {
 	.exit_batch = csum_exit_net,
 	.id   = &csum_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 MODULE_DESCRIPTION("Checksum updating actions");
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index f072bcf33760..74563254e676 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -247,6 +247,7 @@ static struct pernet_operations gact_net_ops = {
 	.exit_batch = gact_exit_net,
 	.id   = &gact_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index a5994cf0512b..555b1caeff72 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -870,6 +870,7 @@ static struct pernet_operations ife_net_ops = {
 	.exit_batch = ife_exit_net,
 	.id   = &ife_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 static int __init ife_init_module(void)
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 9784629090ad..10866717f88e 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -349,6 +349,7 @@ static struct pernet_operations ipt_net_ops = {
 	.exit_batch = ipt_exit_net,
 	.id   = &ipt_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
@@ -399,6 +400,7 @@ static struct pernet_operations xt_net_ops = {
 	.exit_batch = xt_exit_net,
 	.id   = &xt_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-13)");
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index fd34015331ab..64c86579c3d9 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -353,6 +353,7 @@ static struct pernet_operations mirred_net_ops = {
 	.exit_batch = mirred_exit_net,
 	.id   = &mirred_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002)");
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 4b5848b6c252..b1bc757f6491 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -323,6 +323,7 @@ static struct pernet_operations nat_net_ops = {
 	.exit_batch = nat_exit_net,
 	.id   = &nat_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 MODULE_DESCRIPTION("Stateless NAT actions");
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 094303c27c5e..5e8cc8f63acd 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -465,6 +465,7 @@ static struct pernet_operations pedit_net_ops = {
 	.exit_batch = pedit_exit_net,
 	.id   = &pedit_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index ff55bd6c7db0..51fe4fe343f7 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -347,6 +347,7 @@ static struct pernet_operations police_net_ops = {
 	.exit_batch = police_exit_net,
 	.id   = &police_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 static int __init police_init_module(void)
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 9765145aaf40..238dfd27e995 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -248,6 +248,7 @@ static struct pernet_operations sample_net_ops = {
 	.exit_batch = sample_exit_net,
 	.id   = &sample_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 static int __init sample_init_module(void)
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 8244e221fe4f..91816d73f3f3 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -216,6 +216,7 @@ static struct pernet_operations simp_net_ops = {
 	.exit_batch = simp_exit_net,
 	.id   = &simp_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2005)");
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index ddf69fc01bdf..7971510fe61b 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -253,6 +253,7 @@ static struct pernet_operations skbedit_net_ops = {
 	.exit_batch = skbedit_exit_net,
 	.id   = &skbedit_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index a406f191cb84..febec75f4f7a 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -278,6 +278,7 @@ static struct pernet_operations skbmod_net_ops = {
 	.exit_batch = skbmod_exit_net,
 	.id   = &skbmod_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim, <jhs@mojatatu.com>");
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 41ff9d0e5c62..9169b7e78ada 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -337,6 +337,7 @@ static struct pernet_operations tunnel_key_net_ops = {
 	.exit_batch = tunnel_key_exit_net,
 	.id   = &tunnel_key_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 static int __init tunnel_key_init_module(void)
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 71411a255f04..c2ee7fd51cc9 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -313,6 +313,7 @@ static struct pernet_operations vlan_net_ops = {
 	.exit_batch = vlan_exit_net,
 	.id   = &vlan_net_id,
 	.size = sizeof(struct tc_action_net),
+	.async = true,
 };
 
 static int __init vlan_init_module(void)
-- 
cgit v1.2.3


From 3cec5fb3476e07833fea9a1e2d5f6c629078b4ae Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:01:43 +0300
Subject: net: Convert br_net_ops

These pernet_operations are similar to bond_net_ops. Exit method
unregisters all net bridge devices, and it looks like another
pernet_operations are not interested in foreign net bridge list.
So, it's possible to mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/bridge/br.c b/net/bridge/br.c
index 6bf06e756df2..7770481a6506 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -188,6 +188,7 @@ static void __net_exit br_net_exit(struct net *net)
 
 static struct pernet_operations br_net_ops = {
 	.exit	= br_net_exit,
+	.async	= true,
 };
 
 static const struct stp_proto br_stp_proto = {
-- 
cgit v1.2.3


From 31502104b301c0dcacd5df8e92fba9dcf20dfccb Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:01:52 +0300
Subject: net: Convert ipgre_net_ops, ipgre_tap_net_ops, erspan_net_ops,
 vti_net_ops and ipip_net_ops

These pernet_operations are similar to bond_net_ops. Exit methods
unregisters all net ipgre/ipgre_tap/erspan/vti/ipip devices, and it
looks like another pernet_operations are not interested in foreign
net ipgre/ipgre_tap/erspan/vti/ipip list. Init method also does not
intersect with something pernet-specific. So, it's possible
to mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 3 +++
 net/ipv4/ip_vti.c | 1 +
 net/ipv4/ipip.c   | 1 +
 3 files changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 45d97e9b2759..e496afa47709 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1044,6 +1044,7 @@ static struct pernet_operations ipgre_net_ops = {
 	.exit_batch = ipgre_exit_batch_net,
 	.id   = &ipgre_net_id,
 	.size = sizeof(struct ip_tunnel_net),
+	.async = true,
 };
 
 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -1623,6 +1624,7 @@ static struct pernet_operations ipgre_tap_net_ops = {
 	.exit_batch = ipgre_tap_exit_batch_net,
 	.id   = &gre_tap_net_id,
 	.size = sizeof(struct ip_tunnel_net),
+	.async = true,
 };
 
 static int __net_init erspan_init_net(struct net *net)
@@ -1641,6 +1643,7 @@ static struct pernet_operations erspan_net_ops = {
 	.exit_batch = erspan_exit_batch_net,
 	.id   = &erspan_net_id,
 	.size = sizeof(struct ip_tunnel_net),
+	.async = true,
 };
 
 static int __init ipgre_init(void)
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 51b1669334fe..b10bf563afd9 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -454,6 +454,7 @@ static struct pernet_operations vti_net_ops = {
 	.exit_batch = vti_exit_batch_net,
 	.id   = &vti_net_id,
 	.size = sizeof(struct ip_tunnel_net),
+	.async = true,
 };
 
 static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c891235b4966..9c5a4d164f09 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -669,6 +669,7 @@ static struct pernet_operations ipip_net_ops = {
 	.exit_batch = ipip_exit_batch_net,
 	.id   = &ipip_net_id,
 	.size = sizeof(struct ip_tunnel_net),
+	.async = true,
 };
 
 static int __init ipip_init(void)
-- 
cgit v1.2.3


From 5c155c50244a0c6e1a7778ae7b4d2753e5e1f617 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:02:03 +0300
Subject: net: Convert ip6gre_net_ops

These pernet_operations are similar to bond_net_ops. Exit method
unregisters all net ip6gre devices, and it looks like another
pernet_operations are not interested in foreign net ip6gre list
or net_generic()->tunnels_wc. Init method registers net device.
So, it's possible to mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 3c353125546d..3026662a6413 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1517,6 +1517,7 @@ static struct pernet_operations ip6gre_net_ops = {
 	.exit_batch = ip6gre_exit_batch_net,
 	.id   = &ip6gre_net_id,
 	.size = sizeof(struct ip6gre_net),
+	.async = true,
 };
 
 static int ip6gre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
-- 
cgit v1.2.3


From 66997ba0834ac1b3f22873d349614405e48c69d7 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:02:11 +0300
Subject: net: Convert ip6_tnl_net_ops

These pernet_operations are similar to ip6gre_net_ops. Exit method
unregisters all net ip6_tnl tunnels, and it looks like another
pernet_operations are not interested in foreign net tunnels list.
So, it's possible to mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_tunnel.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 4b15fe928278..869e2e6750f7 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -2250,6 +2250,7 @@ static struct pernet_operations ip6_tnl_net_ops = {
 	.exit_batch = ip6_tnl_exit_batch_net,
 	.id   = &ip6_tnl_net_id,
 	.size = sizeof(struct ip6_tnl_net),
+	.async = true,
 };
 
 /**
-- 
cgit v1.2.3


From 5ecc29550add41a0f077b07840501d7a3abfc9db Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:02:19 +0300
Subject: net: Convert vti6_net_ops

These pernet_operations are similar to ip6_tnl_net_ops. Exit method
unregisters all net vti6 tunnels, and it looks like another
pernet_operations are not interested in foreign net vti6 list.
Init method registers netdevice. So, it's possible to mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_vti.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index fa3ae1cb50d3..c617ea17faa8 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -1148,6 +1148,7 @@ static struct pernet_operations vti6_net_ops = {
 	.exit_batch = vti6_exit_batch_net,
 	.id   = &vti6_net_id,
 	.size = sizeof(struct vti6_net),
+	.async = true,
 };
 
 static struct xfrm6_protocol vti_esp6_protocol __read_mostly = {
-- 
cgit v1.2.3


From 989d9812b7cabbda2e82f47e52dbc48ed4e40ce5 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:02:27 +0300
Subject: net: Convert sit_net_ops

These pernet_operations are similar to ip6_tnl_net_ops. Exit method
unregisters all net sit devices, and it looks like another
pernet_operations are not interested in foreign net sit list.
Init method registers netdevice. So, it's possible to mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/sit.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 3a1775a62973..182db078f01e 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1878,6 +1878,7 @@ static struct pernet_operations sit_net_ops = {
 	.exit_batch = sit_exit_batch_net,
 	.id   = &sit_net_id,
 	.size = sizeof(struct sit_net),
+	.async = true,
 };
 
 static void __exit sit_cleanup(void)
-- 
cgit v1.2.3


From f17c9bf07f9cde430e5c566a5383875ffba104cf Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:02:37 +0300
Subject: net: Convert cfg802154_pernet_ops

These pernet_operations have only exit method, which
moves devices from cfg802154_rdev_list to init_net.
This may occur in any time from nl802154_wpan_phy_netns(),
so we are nice with rtnl_lock() synchronization.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index cb7176cd4cd6..9104943c15ba 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -345,6 +345,7 @@ static void __net_exit cfg802154_pernet_exit(struct net *net)
 
 static struct pernet_operations cfg802154_pernet_ops = {
 	.exit = cfg802154_pernet_exit,
+	.async = true,
 };
 
 static int __init wpan_phy_class_init(void)
-- 
cgit v1.2.3


From 7ca9e67febb1441baa0481fbd46745288338afe6 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:02:56 +0300
Subject: net: Convert brnf_net_ops

These pernet_operations only unregister nf hooks.
So, they are able to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netfilter_hooks.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 27f1d4f2114a..484f54150525 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -967,6 +967,7 @@ static struct pernet_operations brnf_net_ops __read_mostly = {
 	.exit = brnf_exit_net,
 	.id   = &brnf_net_id,
 	.size = sizeof(struct brnf_net),
+	.async = true,
 };
 
 static struct notifier_block brnf_notifier __read_mostly = {
-- 
cgit v1.2.3


From f95978b7ad0972610b14c65f13f69b3093ff9101 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:03:05 +0300
Subject: net: Convert clusterip_net_ops

These pernet_operations register and unregister nf hooks,
and populate and destroy /proc entry. So, they are able
to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 4b02ab39ebc5..08b3e48f44fc 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -840,6 +840,7 @@ static struct pernet_operations clusterip_net_ops = {
 	.exit = clusterip_net_exit,
 	.id   = &clusterip_net_id,
 	.size = sizeof(struct clusterip_net),
+	.async = true,
 };
 
 static int __init clusterip_tg_init(void)
-- 
cgit v1.2.3


From e5b2ae93b523304461b2e4ddb59be47f4d4ed1b0 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:03:15 +0300
Subject: net: Convert defrag4_net_ops

These pernet_operations only unregister nf hooks.
So, they are able to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/nf_defrag_ipv4.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index a0d3ad60a411..57244b62a4fc 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -118,6 +118,7 @@ static void __net_exit defrag4_net_exit(struct net *net)
 
 static struct pernet_operations defrag4_net_ops = {
 	.exit = defrag4_net_exit,
+	.async = true,
 };
 
 static int __init nf_defrag_init(void)
-- 
cgit v1.2.3


From afd7b3eb13463693250e82d19f8605312bb5c04d Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:03:36 +0300
Subject: net: Convert ila_net_ops

These pernet_operations register and unregister nf hooks.
Also they populate and depopulate ila_net_id-pointed hash
table. The table is changed by hooks during skb processing
and via netlink request. It looks impossible for another
net pernet_operations to force the table reading or writing,
so, they are able to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ila/ila_xlat.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 44c39c5f0638..e438699f000f 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -613,6 +613,7 @@ static struct pernet_operations ila_net_ops = {
 	.exit = ila_exit_net,
 	.id   = &ila_net_id,
 	.size = sizeof(struct ila_net),
+	.async = true,
 };
 
 static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
-- 
cgit v1.2.3


From 9532ce17f7d59d1129165949076491c06f89d1b0 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Feb 2018 16:03:45 +0300
Subject: net: Convert defrag6_net_ops

These pernet_operations only unregister nf hooks.
So, they are able to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index c87b48359e8f..32f98bc06900 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -103,6 +103,7 @@ static void __net_exit defrag6_net_exit(struct net *net)
 
 static struct pernet_operations defrag6_net_ops = {
 	.exit = defrag6_net_exit,
+	.async = true,
 };
 
 static int __init nf_defrag_init(void)
-- 
cgit v1.2.3


From 401910db4cd425899832a093539222b6174f92a2 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Tue, 27 Feb 2018 09:52:43 -0800
Subject: rds: deliver zerocopy completion notification with data

This commit is an optimization over commit 01883eda72bd
("rds: support for zcopy completion notification") for PF_RDS sockets.

RDS applications are predominantly request-response transactions, so
it is more efficient to reduce the number of system calls and have
zerocopy completion notification delivered as ancillary data on the
POLLIN channel.

Cookies are passed up as ancillary data (at level SOL_RDS) in a
struct rds_zcopy_cookies when the returned value of recvmsg() is
greater than, or equal to, 0. A max of RDS_MAX_ZCOOKIES may be passed
with each message.

This commit removes support for zerocopy completion notification on
MSG_ERRQUEUE for PF_RDS sockets.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/af_rds.c  |  7 +++++--
 net/rds/message.c | 38 ++++++++++++++++----------------------
 net/rds/rds.h     |  2 ++
 net/rds/recv.c    | 31 ++++++++++++++++++++++++++++++-
 4 files changed, 53 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index a937f18896ae..f7126108a811 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -77,6 +77,7 @@ static int rds_release(struct socket *sock)
 	rds_send_drop_to(rs, NULL);
 	rds_rdma_drop_keys(rs);
 	rds_notify_queue_get(rs, NULL);
+	__skb_queue_purge(&rs->rs_zcookie_queue);
 
 	spin_lock_bh(&rds_sock_lock);
 	list_del_init(&rs->rs_item);
@@ -144,7 +145,7 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
  *  -	to signal that a previously congested destination may have become
  *	uncongested
  *  -	A notification has been queued to the socket (this can be a congestion
- *	update, or a RDMA completion).
+ *	update, or a RDMA completion, or a MSG_ZEROCOPY completion).
  *
  * EPOLLOUT is asserted if there is room on the send queue. This does not mean
  * however, that the next sendmsg() call will succeed. If the application tries
@@ -178,7 +179,8 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
 		spin_unlock(&rs->rs_lock);
 	}
 	if (!list_empty(&rs->rs_recv_queue) ||
-	    !list_empty(&rs->rs_notify_queue))
+	    !list_empty(&rs->rs_notify_queue) ||
+	    !skb_queue_empty(&rs->rs_zcookie_queue))
 		mask |= (EPOLLIN | EPOLLRDNORM);
 	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
 		mask |= (EPOLLOUT | EPOLLWRNORM);
@@ -513,6 +515,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
 	INIT_LIST_HEAD(&rs->rs_recv_queue);
 	INIT_LIST_HEAD(&rs->rs_notify_queue);
 	INIT_LIST_HEAD(&rs->rs_cong_list);
+	skb_queue_head_init(&rs->rs_zcookie_queue);
 	spin_lock_init(&rs->rs_rdma_lock);
 	rs->rs_rdma_keys = RB_ROOT;
 	rs->rs_rx_traces = 0;
diff --git a/net/rds/message.c b/net/rds/message.c
index 651834513481..116cf87ccb89 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -58,32 +58,26 @@ EXPORT_SYMBOL_GPL(rds_message_addref);
 
 static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie)
 {
-	struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
-	int ncookies;
-	u32 *ptr;
+	struct rds_zcopy_cookies *ck = (struct rds_zcopy_cookies *)skb->cb;
+	int ncookies = ck->num;
 
-	if (serr->ee.ee_origin != SO_EE_ORIGIN_ZCOOKIE)
+	if (ncookies == RDS_MAX_ZCOOKIES)
 		return false;
-	ncookies = serr->ee.ee_data;
-	if (ncookies == SO_EE_ORIGIN_MAX_ZCOOKIES)
-		return false;
-	ptr = skb_put(skb, sizeof(u32));
-	*ptr = cookie;
-	serr->ee.ee_data = ++ncookies;
+	ck->cookies[ncookies] = cookie;
+	ck->num =  ++ncookies;
 	return true;
 }
 
 static void rds_rm_zerocopy_callback(struct rds_sock *rs,
 				     struct rds_znotifier *znotif)
 {
-	struct sock *sk = rds_rs_to_sk(rs);
 	struct sk_buff *skb, *tail;
-	struct sock_exterr_skb *serr;
 	unsigned long flags;
 	struct sk_buff_head *q;
 	u32 cookie = znotif->z_cookie;
+	struct rds_zcopy_cookies *ck;
 
-	q = &sk->sk_error_queue;
+	q = &rs->rs_zcookie_queue;
 	spin_lock_irqsave(&q->lock, flags);
 	tail = skb_peek_tail(q);
 
@@ -91,22 +85,19 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs,
 		spin_unlock_irqrestore(&q->lock, flags);
 		mm_unaccount_pinned_pages(&znotif->z_mmp);
 		consume_skb(rds_skb_from_znotifier(znotif));
-		sk->sk_error_report(sk);
+		/* caller invokes rds_wake_sk_sleep() */
 		return;
 	}
 
 	skb = rds_skb_from_znotifier(znotif);
-	serr = SKB_EXT_ERR(skb);
-	memset(&serr->ee, 0, sizeof(serr->ee));
-	serr->ee.ee_errno = 0;
-	serr->ee.ee_origin = SO_EE_ORIGIN_ZCOOKIE;
-	serr->ee.ee_info = 0;
+	ck = (struct rds_zcopy_cookies *)skb->cb;
+	memset(ck, 0, sizeof(*ck));
 	WARN_ON(!skb_zcookie_add(skb, cookie));
 
 	__skb_queue_tail(q, skb);
 
 	spin_unlock_irqrestore(&q->lock, flags);
-	sk->sk_error_report(sk);
+	/* caller invokes rds_wake_sk_sleep() */
 
 	mm_unaccount_pinned_pages(&znotif->z_mmp);
 }
@@ -129,6 +120,7 @@ static void rds_message_purge(struct rds_message *rm)
 		if (rm->data.op_mmp_znotifier) {
 			zcopy = true;
 			rds_rm_zerocopy_callback(rs, rm->data.op_mmp_znotifier);
+			rds_wake_sk_sleep(rs);
 			rm->data.op_mmp_znotifier = NULL;
 		}
 		sock_put(rds_rs_to_sk(rs));
@@ -362,10 +354,12 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
 		int total_copied = 0;
 		struct sk_buff *skb;
 
-		skb = alloc_skb(SO_EE_ORIGIN_MAX_ZCOOKIES * sizeof(u32),
-				GFP_KERNEL);
+		skb = alloc_skb(0, GFP_KERNEL);
 		if (!skb)
 			return -ENOMEM;
+		BUILD_BUG_ON(sizeof(skb->cb) <
+			     max_t(int, sizeof(struct rds_znotifier),
+				   sizeof(struct rds_zcopy_cookies)));
 		rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb);
 		if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
 					    length)) {
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 31cd38852050..33b16353d8f3 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -603,6 +603,8 @@ struct rds_sock {
 	/* Socket receive path trace points*/
 	u8			rs_rx_traces;
 	u8			rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
+
+	struct sk_buff_head	rs_zcookie_queue;
 };
 
 static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
diff --git a/net/rds/recv.c b/net/rds/recv.c
index b080961464df..d50747725221 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -577,6 +577,32 @@ out:
 	return ret;
 }
 
+static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg)
+{
+	struct sk_buff *skb;
+	struct sk_buff_head *q = &rs->rs_zcookie_queue;
+	struct rds_zcopy_cookies *done;
+
+	if (!msg->msg_control)
+		return false;
+
+	if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) ||
+	    msg->msg_controllen < CMSG_SPACE(sizeof(*done)))
+		return false;
+
+	skb = skb_dequeue(q);
+	if (!skb)
+		return false;
+	done = (struct rds_zcopy_cookies *)skb->cb;
+	if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done),
+		     done)) {
+		skb_queue_head(q, skb);
+		return false;
+	}
+	consume_skb(skb);
+	return true;
+}
+
 int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 		int msg_flags)
 {
@@ -611,7 +637,9 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 
 		if (!rds_next_incoming(rs, &inc)) {
 			if (nonblock) {
-				ret = -EAGAIN;
+				bool reaped = rds_recvmsg_zcookie(rs, msg);
+
+				ret = reaped ?  0 : -EAGAIN;
 				break;
 			}
 
@@ -660,6 +688,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 			ret = -EFAULT;
 			goto out;
 		}
+		rds_recvmsg_zcookie(rs, msg);
 
 		rds_stats_inc(s_recv_delivered);
 
-- 
cgit v1.2.3


From d1b2a6c4bed99fc7e8a11e7abcff19293d1974f5 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Tue, 27 Feb 2018 14:53:37 +0100
Subject: net: GRE: Add is_gretap_dev, is_ip6gretap_dev

Determining whether a device is a GRE device is easily done by
inspecting struct net_device.type. However, for the tap variants, the
type is just ARPHRD_ETHER.

Therefore introduce two predicate functions that use netdev_ops to tell
the tap devices.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c  | 6 ++++++
 net/ipv6/ip6_gre.c | 6 ++++++
 2 files changed, 12 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index e496afa47709..0fe1d69b5df4 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1323,6 +1323,12 @@ static void ipgre_tap_setup(struct net_device *dev)
 	ip_tunnel_setup(dev, gre_tap_net_id);
 }
 
+bool is_gretap_dev(const struct net_device *dev)
+{
+	return dev->netdev_ops == &gre_tap_netdev_ops;
+}
+EXPORT_SYMBOL_GPL(is_gretap_dev);
+
 static int ipgre_newlink(struct net *src_net, struct net_device *dev,
 			 struct nlattr *tb[], struct nlattr *data[],
 			 struct netlink_ext_ack *extack)
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 3026662a6413..4f150a394387 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1785,6 +1785,12 @@ static void ip6gre_tap_setup(struct net_device *dev)
 	netif_keep_dst(dev);
 }
 
+bool is_ip6gretap_dev(const struct net_device *dev)
+{
+	return dev->netdev_ops == &ip6gre_tap_netdev_ops;
+}
+EXPORT_SYMBOL_GPL(is_ip6gretap_dev);
+
 static bool ip6gre_netlink_encap_parms(struct nlattr *data[],
 				       struct ip_tunnel_encap *ipencap)
 {
-- 
cgit v1.2.3


From b0066da52ea53bae2b4ceed3f47d488df27dab66 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Tue, 27 Feb 2018 14:53:38 +0100
Subject: ip_tunnel: Rename & publish init_tunnel_flow

Initializing struct flowi4 is useful for drivers that need to emulate
routing decisions made by a tunnel interface. Publish the
function (appropriately renamed) so that the drivers in question don't
need to cut'n'paste it around.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_tunnel.c | 40 ++++++++++++----------------------------
 1 file changed, 12 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index d786a8441bce..b2117d89bc83 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -290,22 +290,6 @@ failed:
 	return ERR_PTR(err);
 }
 
-static inline void init_tunnel_flow(struct flowi4 *fl4,
-				    int proto,
-				    __be32 daddr, __be32 saddr,
-				    __be32 key, __u8 tos, int oif,
-				    __u32 mark)
-{
-	memset(fl4, 0, sizeof(*fl4));
-	fl4->flowi4_oif = oif;
-	fl4->daddr = daddr;
-	fl4->saddr = saddr;
-	fl4->flowi4_tos = tos;
-	fl4->flowi4_proto = proto;
-	fl4->fl4_gre_key = key;
-	fl4->flowi4_mark = mark;
-}
-
 static int ip_tunnel_bind_dev(struct net_device *dev)
 {
 	struct net_device *tdev = NULL;
@@ -322,10 +306,10 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 		struct flowi4 fl4;
 		struct rtable *rt;
 
-		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
-				 iph->saddr, tunnel->parms.o_key,
-				 RT_TOS(iph->tos), tunnel->parms.link,
-				 tunnel->fwmark);
+		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
+				    iph->saddr, tunnel->parms.o_key,
+				    RT_TOS(iph->tos), tunnel->parms.link,
+				    tunnel->fwmark);
 		rt = ip_route_output_key(tunnel->net, &fl4);
 
 		if (!IS_ERR(rt)) {
@@ -581,8 +565,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
 		else if (skb->protocol == htons(ETH_P_IPV6))
 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
 	}
-	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
-			 RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
+	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
+			    RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
 	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
 		goto tx_error;
 	rt = ip_route_output_key(tunnel->net, &fl4);
@@ -711,14 +695,14 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 	}
 
 	if (tunnel->fwmark) {
-		init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
-				 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
-				 tunnel->fwmark);
+		ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
+				    tunnel->parms.o_key, RT_TOS(tos),
+				    tunnel->parms.link, tunnel->fwmark);
 	}
 	else {
-		init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
-				 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
-				 skb->mark);
+		ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
+				    tunnel->parms.o_key, RT_TOS(tos),
+				    tunnel->parms.link, skb->mark);
 	}
 
 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
-- 
cgit v1.2.3


From 82695b30ffeeab665f41416c6f5015dea3147bd5 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 27 Feb 2018 15:48:21 -0800
Subject: inet: whitespace cleanup

Ran simple script to find/remove trailing whitespace and blank lines
at EOF because that kind of stuff git whines about and editors leave
behind.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 2 +-
 net/ipv4/proc.c          | 1 -
 net/ipv4/tunnel4.c       | 2 +-
 net/ipv4/xfrm4_policy.c  | 1 -
 net/ipv6/anycast.c       | 1 -
 net/ipv6/exthdrs_core.c  | 1 -
 net/ipv6/ipv6_sockglue.c | 1 -
 net/ipv6/proc.c          | 1 -
 net/ipv6/xfrm6_state.c   | 1 -
 9 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index cd46d7666598..f31e6575ab91 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -171,7 +171,7 @@ static void free_nh_exceptions(struct fib_nh *nh)
 		fnhe = rcu_dereference_protected(hash[i].chain, 1);
 		while (fnhe) {
 			struct fib_nh_exception *next;
-			
+
 			next = rcu_dereference_protected(fnhe->fnhe_next, 1);
 
 			rt_fibinfo_free(&fnhe->fnhe_rth_input);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index fdabc70283b6..d97e83b2dd33 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -556,4 +556,3 @@ int __init ip_misc_proc_init(void)
 {
 	return register_pernet_subsys(&ip_proc_ops);
 }
-
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index ec35eaa5c029..c0630013c1ae 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -90,7 +90,7 @@ EXPORT_SYMBOL(xfrm4_tunnel_deregister);
 	for (handler = rcu_dereference(head);		\
 	     handler != NULL;				\
 	     handler = rcu_dereference(handler->next))	\
-	
+
 static int tunnel4_rcv(struct sk_buff *skb)
 {
 	struct xfrm_tunnel *handler;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 796ac4115485..0c752dc3f93b 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -379,4 +379,3 @@ void __init xfrm4_init(void)
 	xfrm4_protocol_init();
 	register_pernet_subsys(&xfrm4_net_ops);
 }
-
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 8e085cc05aeb..d7d0abc7fd0e 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -552,4 +552,3 @@ void ac6_proc_exit(struct net *net)
 	remove_proc_entry("anycast6", net->proc_net);
 }
 #endif
-
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 11025f8d124b..b643f5ce6c80 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -279,4 +279,3 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
 	return nexthdr;
 }
 EXPORT_SYMBOL(ipv6_find_hdr);
-
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 24535169663d..4d780c7f0130 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -1415,4 +1415,3 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname,
 }
 EXPORT_SYMBOL(compat_ipv6_getsockopt);
 #endif
-
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index b8858c546f41..1678cf037688 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -355,4 +355,3 @@ void ipv6_misc_proc_exit(void)
 {
 	unregister_pernet_subsys(&ipv6_proc_ops);
 }
-
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index b15075a5c227..16f434791763 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -196,4 +196,3 @@ void xfrm6_state_fini(void)
 {
 	xfrm_state_unregister_afinfo(&xfrm6_state_afinfo);
 }
-
-- 
cgit v1.2.3


From fd5ac14a1aae63ef95b22cc2eb23e3a25b3436be Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Wed, 28 Feb 2018 10:45:03 +0100
Subject: net: sch: Don't warn on missmatching qlen and backlog for offloaded
 qdiscs

Offloaded qdiscs are allowed to expose only parts of their statistics.
It means that if backlog is being exposed and qlen is not, it might trigger
a warning in qdisc_tree_reduce_backlog.
Do not warn in case the qdisc that was removed was an offloaded one.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_api.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 27e672c12492..68f9d942bed4 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -739,6 +739,7 @@ static u32 qdisc_alloc_handle(struct net_device *dev)
 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
 			       unsigned int len)
 {
+	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
 	const struct Qdisc_class_ops *cops;
 	unsigned long cl;
 	u32 parentid;
@@ -760,8 +761,12 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
 		 * If child was empty even before update then backlog
 		 * counter is screwed and we skip notification because
 		 * parent class is already passive.
+		 *
+		 * If the original child was offloaded then it is allowed
+		 * to be seem as empty, so the parent is notified anyway.
 		 */
-		notify = !sch->q.qlen && !WARN_ON_ONCE(!n);
+		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
+						       !qdisc_is_offloaded);
 		/* TODO: perform the search on a per txq basis */
 		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
 		if (sch == NULL) {
-- 
cgit v1.2.3


From 98ceb7b6d64552f995973be1a0ee9af0bf85fb3d Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Wed, 28 Feb 2018 10:45:05 +0100
Subject: mlxsw: spectrum: qdiscs: prio: Delete child qdiscs when removing
 bands

When the number the bands of sch_prio is decreased, child qdiscs on the
deleted bands would get deleted as well.
This change and deletions are being done under sch_tree_lock of the
sch_prio qdisc. Part of the destruction of qdisc is unoffloading it, if
it is offloaded. Un-offloading can't be done inside this lock.
Move the offload command to be done before reducing the number of bands,
so unoffloading of the qdiscs that are about to be deleted could be done
outside of the lock.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_prio.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index efbf51f35778..ba2d6d17d95a 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -142,9 +142,8 @@ prio_reset(struct Qdisc *sch)
 	sch->q.qlen = 0;
 }
 
-static int prio_offload(struct Qdisc *sch, bool enable)
+static int prio_offload(struct Qdisc *sch, struct tc_prio_qopt *qopt)
 {
-	struct prio_sched_data *q = qdisc_priv(sch);
 	struct net_device *dev = qdisc_dev(sch);
 	struct tc_prio_qopt_offload opt = {
 		.handle = sch->handle,
@@ -154,10 +153,10 @@ static int prio_offload(struct Qdisc *sch, bool enable)
 	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
 		return -EOPNOTSUPP;
 
-	if (enable) {
+	if (qopt) {
 		opt.command = TC_PRIO_REPLACE;
-		opt.replace_params.bands = q->bands;
-		memcpy(&opt.replace_params.priomap, q->prio2band,
+		opt.replace_params.bands = qopt->bands;
+		memcpy(&opt.replace_params.priomap, qopt->priomap,
 		       TC_PRIO_MAX + 1);
 		opt.replace_params.qstats = &sch->qstats;
 	} else {
@@ -174,7 +173,7 @@ prio_destroy(struct Qdisc *sch)
 	struct prio_sched_data *q = qdisc_priv(sch);
 
 	tcf_block_put(q->block);
-	prio_offload(sch, false);
+	prio_offload(sch, NULL);
 	for (prio = 0; prio < q->bands; prio++)
 		qdisc_destroy(q->queues[prio]);
 }
@@ -211,6 +210,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
 		}
 	}
 
+	prio_offload(sch, qopt);
 	sch_tree_lock(sch);
 	q->bands = qopt->bands;
 	memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
@@ -230,7 +230,6 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt,
 	}
 
 	sch_tree_unlock(sch);
-	prio_offload(sch, true);
 	return 0;
 }
 
-- 
cgit v1.2.3


From b9c7a7acc749f3d0667a2ab44ea38110d5a1f286 Mon Sep 17 00:00:00 2001
From: Nogah Frankel <nogahf@mellanox.com>
Date: Wed, 28 Feb 2018 10:45:06 +0100
Subject: net: sch: prio: Add offload ability for grafting a child

Offload sch_prio graft command for capable drivers.
Warn in case of a failure, unless the graft was done as part of a destroy
operation (the new qdisc is a noop) or if all the qdiscs (the parent, the
old child, and the new one) are not offloaded.

Signed-off-by: Nogah Frankel <nogahf@mellanox.com>
Reviewed-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_prio.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'net')

diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index ba2d6d17d95a..222e53d3d27a 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -308,12 +308,44 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 		      struct Qdisc **old, struct netlink_ext_ack *extack)
 {
 	struct prio_sched_data *q = qdisc_priv(sch);
+	struct tc_prio_qopt_offload graft_offload;
+	struct net_device *dev = qdisc_dev(sch);
 	unsigned long band = arg - 1;
+	bool any_qdisc_is_offloaded;
+	int err;
 
 	if (new == NULL)
 		new = &noop_qdisc;
 
 	*old = qdisc_replace(sch, new, &q->queues[band]);
+
+	if (!tc_can_offload(dev))
+		return 0;
+
+	graft_offload.handle = sch->handle;
+	graft_offload.parent = sch->parent;
+	graft_offload.graft_params.band = band;
+	graft_offload.graft_params.child_handle = new->handle;
+	graft_offload.command = TC_PRIO_GRAFT;
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO,
+					    &graft_offload);
+
+	/* Don't report error if the graft is part of destroy operation. */
+	if (err && new != &noop_qdisc) {
+		/* Don't report error if the parent, the old child and the new
+		 * one are not offloaded.
+		 */
+		any_qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
+		any_qdisc_is_offloaded |= new->flags & TCQ_F_OFFLOADED;
+		if (*old)
+			any_qdisc_is_offloaded |= (*old)->flags &
+						   TCQ_F_OFFLOADED;
+
+		if (any_qdisc_is_offloaded)
+			NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From bfff4862653bb96001ab57c1edd6d03f48e5f035 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Wed, 28 Feb 2018 22:40:16 -0500
Subject: net: fib_rules: support for match on ip_proto, sport and dport

uapi for ip_proto, sport and dport range match
in fib rules.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 90 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index a6aea805a0a2..f6f04fc0f629 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -33,6 +33,10 @@ bool fib_rule_matchall(const struct fib_rule *rule)
 	if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) ||
 	    !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end))
 		return false;
+	if (fib_rule_port_range_set(&rule->sport_range))
+		return false;
+	if (fib_rule_port_range_set(&rule->dport_range))
+		return false;
 	return true;
 }
 EXPORT_SYMBOL_GPL(fib_rule_matchall);
@@ -221,6 +225,26 @@ static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
 	return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
 }
 
+static int nla_get_port_range(struct nlattr *pattr,
+			      struct fib_rule_port_range *port_range)
+{
+	const struct fib_rule_port_range *pr = nla_data(pattr);
+
+	if (!fib_rule_port_range_valid(pr))
+		return -EINVAL;
+
+	port_range->start = pr->start;
+	port_range->end = pr->end;
+
+	return 0;
+}
+
+static int nla_put_port_range(struct sk_buff *skb, int attrtype,
+			      struct fib_rule_port_range *range)
+{
+	return nla_put(skb, attrtype, sizeof(*range), range);
+}
+
 static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
 			  struct flowi *fl, int flags,
 			  struct fib_lookup_arg *arg)
@@ -425,6 +449,17 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
 		    !uid_eq(r->uid_range.end, rule->uid_range.end))
 			continue;
 
+		if (r->ip_proto != rule->ip_proto)
+			continue;
+
+		if (!fib_rule_port_range_compare(&r->sport_range,
+						 &rule->sport_range))
+			continue;
+
+		if (!fib_rule_port_range_compare(&r->dport_range,
+						 &rule->dport_range))
+			continue;
+
 		if (!ops->compare(r, frh, tb))
 			continue;
 		return 1;
@@ -569,6 +604,23 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		rule->uid_range = fib_kuid_range_unset;
 	}
 
+	if (tb[FRA_IP_PROTO])
+		rule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);
+
+	if (tb[FRA_SPORT_RANGE]) {
+		err = nla_get_port_range(tb[FRA_SPORT_RANGE],
+					 &rule->sport_range);
+		if (err)
+			goto errout_free;
+	}
+
+	if (tb[FRA_DPORT_RANGE]) {
+		err = nla_get_port_range(tb[FRA_DPORT_RANGE],
+					 &rule->dport_range);
+		if (err)
+			goto errout_free;
+	}
+
 	if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
 	    rule_exists(ops, frh, tb, rule)) {
 		err = -EEXIST;
@@ -634,6 +686,8 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 {
 	struct net *net = sock_net(skb->sk);
 	struct fib_rule_hdr *frh = nlmsg_data(nlh);
+	struct fib_rule_port_range sprange = {0, 0};
+	struct fib_rule_port_range dprange = {0, 0};
 	struct fib_rules_ops *ops = NULL;
 	struct fib_rule *rule, *r;
 	struct nlattr *tb[FRA_MAX+1];
@@ -667,6 +721,20 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		range = fib_kuid_range_unset;
 	}
 
+	if (tb[FRA_SPORT_RANGE]) {
+		err = nla_get_port_range(tb[FRA_SPORT_RANGE],
+					 &sprange);
+		if (err)
+			goto errout;
+	}
+
+	if (tb[FRA_DPORT_RANGE]) {
+		err = nla_get_port_range(tb[FRA_DPORT_RANGE],
+					 &dprange);
+		if (err)
+			goto errout;
+	}
+
 	list_for_each_entry(rule, &ops->rules_list, list) {
 		if (tb[FRA_PROTOCOL] &&
 		    (rule->proto != nla_get_u8(tb[FRA_PROTOCOL])))
@@ -712,6 +780,18 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 		     !uid_eq(rule->uid_range.end, range.end)))
 			continue;
 
+		if (tb[FRA_IP_PROTO] &&
+		    (rule->ip_proto != nla_get_u8(tb[FRA_IP_PROTO])))
+			continue;
+
+		if (fib_rule_port_range_set(&sprange) &&
+		    !fib_rule_port_range_compare(&rule->sport_range, &sprange))
+			continue;
+
+		if (fib_rule_port_range_set(&dprange) &&
+		    !fib_rule_port_range_compare(&rule->dport_range, &dprange))
+			continue;
+
 		if (!ops->compare(rule, frh, tb))
 			continue;
 
@@ -790,7 +870,10 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
 			 + nla_total_size(4) /* FRA_FWMASK */
 			 + nla_total_size_64bit(8) /* FRA_TUN_ID */
 			 + nla_total_size(sizeof(struct fib_kuid_range))
-			 + nla_total_size(1); /* FRA_PROTOCOL */
+			 + nla_total_size(1) /* FRA_PROTOCOL */
+			 + nla_total_size(1) /* FRA_IP_PROTO */
+			 + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */
+			 + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */
 
 	if (ops->nlmsg_payload)
 		payload += ops->nlmsg_payload(rule);
@@ -855,7 +938,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
 	    (rule->l3mdev &&
 	     nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev)) ||
 	    (uid_range_set(&rule->uid_range) &&
-	     nla_put_uid_range(skb, &rule->uid_range)))
+	     nla_put_uid_range(skb, &rule->uid_range)) ||
+	    (fib_rule_port_range_set(&rule->sport_range) &&
+	     nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) ||
+	    (fib_rule_port_range_set(&rule->dport_range) &&
+	     nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) ||
+	    (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto)))
 		goto nla_put_failure;
 
 	if (rule->suppress_ifgroup != -1) {
-- 
cgit v1.2.3


From 4a2d73a4fb36ed4d73967bd3892cfab014a52ab2 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Wed, 28 Feb 2018 22:41:06 -0500
Subject: ipv4: fib_rules: support match on sport, dport and ip proto

support to match on src port, dst port and ip protocol.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_rules.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 35d646a62ad4..16083b82954b 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -182,6 +182,17 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 	if (r->tos && (r->tos != fl4->flowi4_tos))
 		return 0;
 
+	if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto))
+		return 0;
+
+	if (fib_rule_port_range_set(&rule->sport_range) &&
+	    !fib_rule_port_inrange(&rule->sport_range, fl4->fl4_sport))
+		return 0;
+
+	if (fib_rule_port_range_set(&rule->dport_range) &&
+	    !fib_rule_port_inrange(&rule->dport_range, fl4->fl4_dport))
+		return 0;
+
 	return 1;
 }
 
-- 
cgit v1.2.3


From bb0ad1987e963e47a02cc53b8275ffe2b38d4b70 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Wed, 28 Feb 2018 22:41:37 -0500
Subject: ipv6: fib6_rules: support for match on sport, dport and ip proto

support to match on src port, dst port and ip protocol.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/fib6_rules.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 95a2c9e8699a..bcd1f22ac7b1 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -223,6 +223,17 @@ static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
 	if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel))
 		return 0;
 
+	if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto))
+		return 0;
+
+	if (fib_rule_port_range_set(&rule->sport_range) &&
+	    !fib_rule_port_inrange(&rule->sport_range, fl6->fl6_sport))
+		return 0;
+
+	if (fib_rule_port_range_set(&rule->dport_range) &&
+	    !fib_rule_port_inrange(&rule->dport_range, fl6->fl6_dport))
+		return 0;
+
 	return 1;
 }
 
-- 
cgit v1.2.3


From e37b1e978bec5334dc379d8c2423d063af207430 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Wed, 28 Feb 2018 22:42:41 -0500
Subject: ipv6: route: dissect flow in input path if fib rules need it

Dissect flow in fwd path if fib rules require it. Controlled by
a flag to avoid penatly for the common case. Flag is set when fib
rules with sport, dport and proto match that require flow dissect
are installed. Also passes the dissected hash keys to the multipath
hash function when applicable to avoid dissecting the flow again.
icmp packets will continue to use inner header for hash
calculations (Thanks to Nikolay Aleksandrov for some review here).

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_rules.c     |  8 ++++++++
 net/ipv4/fib_semantics.c |  2 +-
 net/ipv4/route.c         | 43 +++++++++++++++++++++++++++++--------------
 3 files changed, 38 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 16083b82954b..737d11bc8838 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -255,6 +255,9 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 	}
 #endif
 
+	if (fib_rule_requires_fldissect(rule))
+		net->ipv4.fib_rules_require_fldissect++;
+
 	rule4->src_len = frh->src_len;
 	rule4->srcmask = inet_make_mask(rule4->src_len);
 	rule4->dst_len = frh->dst_len;
@@ -283,6 +286,10 @@ static int fib4_rule_delete(struct fib_rule *rule)
 		net->ipv4.fib_num_tclassid_users--;
 #endif
 	net->ipv4.fib_has_custom_rules = true;
+
+	if (net->ipv4.fib_rules_require_fldissect &&
+	    fib_rule_requires_fldissect(rule))
+		net->ipv4.fib_rules_require_fldissect--;
 errout:
 	return err;
 }
@@ -400,6 +407,7 @@ int __net_init fib4_rules_init(struct net *net)
 		goto fail;
 	net->ipv4.rules_ops = ops;
 	net->ipv4.fib_has_custom_rules = false;
+	net->ipv4.fib_rules_require_fldissect = 0;
 	return 0;
 
 fail:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f31e6575ab91..181b0d8d589c 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1770,7 +1770,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi->fib_nhs > 1) {
-		int h = fib_multipath_hash(res->fi, fl4, skb);
+		int h = fib_multipath_hash(res->fi, fl4, skb, NULL);
 
 		fib_select_multipath(res, h);
 	}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 26eefa2eaa44..3bb686dac273 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1783,7 +1783,7 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb,
 
 /* if skb is set it will be used and fl4 can be NULL */
 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
-		       const struct sk_buff *skb)
+		       const struct sk_buff *skb, struct flow_keys *flkeys)
 {
 	struct net *net = fi->fib_net;
 	struct flow_keys hash_keys;
@@ -1810,14 +1810,23 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
 			if (skb->l4_hash)
 				return skb_get_hash_raw(skb) >> 1;
 			memset(&hash_keys, 0, sizeof(hash_keys));
-			skb_flow_dissect_flow_keys(skb, &keys, flag);
 
-			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
-			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
-			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
-			hash_keys.ports.src = keys.ports.src;
-			hash_keys.ports.dst = keys.ports.dst;
-			hash_keys.basic.ip_proto = keys.basic.ip_proto;
+			if (flkeys) {
+				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+				hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
+				hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
+				hash_keys.ports.src = flkeys->ports.src;
+				hash_keys.ports.dst = flkeys->ports.dst;
+				hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
+			} else {
+				skb_flow_dissect_flow_keys(skb, &keys, flag);
+				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+				hash_keys.ports.src = keys.ports.src;
+				hash_keys.ports.dst = keys.ports.dst;
+				hash_keys.basic.ip_proto = keys.basic.ip_proto;
+			}
 		} else {
 			memset(&hash_keys, 0, sizeof(hash_keys));
 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
@@ -1838,11 +1847,12 @@ int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
 static int ip_mkroute_input(struct sk_buff *skb,
 			    struct fib_result *res,
 			    struct in_device *in_dev,
-			    __be32 daddr, __be32 saddr, u32 tos)
+			    __be32 daddr, __be32 saddr, u32 tos,
+			    struct flow_keys *hkeys)
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1) {
-		int h = fib_multipath_hash(res->fi, NULL, skb);
+		int h = fib_multipath_hash(res->fi, NULL, skb, hkeys);
 
 		fib_select_multipath(res, h);
 	}
@@ -1868,13 +1878,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 			       struct fib_result *res)
 {
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	struct flow_keys *flkeys = NULL, _flkeys;
+	struct net    *net = dev_net(dev);
 	struct ip_tunnel_info *tun_info;
-	struct flowi4	fl4;
+	int		err = -EINVAL;
 	unsigned int	flags = 0;
 	u32		itag = 0;
 	struct rtable	*rth;
-	int		err = -EINVAL;
-	struct net    *net = dev_net(dev);
+	struct flowi4	fl4;
 	bool do_cache;
 
 	/* IP on this device is disabled. */
@@ -1933,6 +1944,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	fl4.daddr = daddr;
 	fl4.saddr = saddr;
 	fl4.flowi4_uid = sock_net_uid(net, NULL);
+
+	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys))
+		flkeys = &_flkeys;
+
 	err = fib_lookup(net, &fl4, res, 0);
 	if (err != 0) {
 		if (!IN_DEV_FORWARD(in_dev))
@@ -1958,7 +1973,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (res->type != RTN_UNICAST)
 		goto martian_destination;
 
-	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
+	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
 out:	return err;
 
 brd_input:
-- 
cgit v1.2.3


From 5e5d6fed374155ba1a7a5ca5f12fbec2285d06a2 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Wed, 28 Feb 2018 22:43:22 -0500
Subject: ipv6: route: dissect flow in input path if fib rules need it

Dissect flow in fwd path if fib rules require it. Controlled by
a flag to avoid penatly for the common case. Flag is set when fib
rules with sport, dport and proto match that require flow dissect
are installed. Also passes the dissected hash keys to the multipath
hash function when applicable to avoid dissecting the flow again.
icmp packets will continue to use inner header for hash
calculations.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/fib6_rules.c | 16 ++++++++++++++++
 net/ipv6/icmp.c       |  2 +-
 net/ipv6/route.c      | 34 +++++++++++++++++++++++++---------
 3 files changed, 42 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index bcd1f22ac7b1..04e5f523e50f 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -269,12 +269,26 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
 	rule6->dst.plen = frh->dst_len;
 	rule6->tclass = frh->tos;
 
+	if (fib_rule_requires_fldissect(rule))
+		net->ipv6.fib6_rules_require_fldissect++;
+
 	net->ipv6.fib6_has_custom_rules = true;
 	err = 0;
 errout:
 	return err;
 }
 
+static int fib6_rule_delete(struct fib_rule *rule)
+{
+	struct net *net = rule->fr_net;
+
+	if (net->ipv6.fib6_rules_require_fldissect &&
+	    fib_rule_requires_fldissect(rule))
+		net->ipv6.fib6_rules_require_fldissect--;
+
+	return 0;
+}
+
 static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
 			     struct nlattr **tb)
 {
@@ -334,6 +348,7 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
 	.match			= fib6_rule_match,
 	.suppress		= fib6_rule_suppress,
 	.configure		= fib6_rule_configure,
+	.delete			= fib6_rule_delete,
 	.compare		= fib6_rule_compare,
 	.fill			= fib6_rule_fill,
 	.nlmsg_payload		= fib6_rule_nlmsg_payload,
@@ -361,6 +376,7 @@ static int __net_init fib6_rules_net_init(struct net *net)
 		goto out_fib6_rules_ops;
 
 	net->ipv6.fib6_rules_ops = ops;
+	net->ipv6.fib6_rules_require_fldissect = 0;
 out:
 	return err;
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 4fa4f1b150a4..b0778d323b6e 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	fl6.fl6_icmp_type = type;
 	fl6.fl6_icmp_code = code;
 	fl6.flowi6_uid = sock_net_uid(net, NULL);
-	fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
+	fl6.mp_hash = rt6_multipath_hash(&fl6, skb, NULL);
 	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
 
 	sk = icmpv6_xmit_lock(net);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index aa709b644945..e2bb40824c85 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -460,7 +460,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 	 * case it will always be non-zero. Otherwise now is the time to do it.
 	 */
 	if (!fl6->mp_hash)
-		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
+		fl6->mp_hash = rt6_multipath_hash(fl6, NULL, NULL);
 
 	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
 		return match;
@@ -1786,10 +1786,12 @@ struct dst_entry *ip6_route_input_lookup(struct net *net,
 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
 
 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
-				  struct flow_keys *keys)
+				  struct flow_keys *keys,
+				  struct flow_keys *flkeys)
 {
 	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
 	const struct ipv6hdr *key_iph = outer_iph;
+	struct flow_keys *_flkeys = flkeys;
 	const struct ipv6hdr *inner_iph;
 	const struct icmp6hdr *icmph;
 	struct ipv6hdr _inner_iph;
@@ -1811,22 +1813,31 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb,
 		goto out;
 
 	key_iph = inner_iph;
+	_flkeys = NULL;
 out:
 	memset(keys, 0, sizeof(*keys));
 	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
-	keys->addrs.v6addrs.src = key_iph->saddr;
-	keys->addrs.v6addrs.dst = key_iph->daddr;
-	keys->tags.flow_label = ip6_flowinfo(key_iph);
-	keys->basic.ip_proto = key_iph->nexthdr;
+	if (_flkeys) {
+		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
+		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
+		keys->tags.flow_label = _flkeys->tags.flow_label;
+		keys->basic.ip_proto = _flkeys->basic.ip_proto;
+	} else {
+		keys->addrs.v6addrs.src = key_iph->saddr;
+		keys->addrs.v6addrs.dst = key_iph->daddr;
+		keys->tags.flow_label = ip6_flowinfo(key_iph);
+		keys->basic.ip_proto = key_iph->nexthdr;
+	}
 }
 
 /* if skb is set it will be used and fl6 can be NULL */
-u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
+u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb,
+		       struct flow_keys *flkeys)
 {
 	struct flow_keys hash_keys;
 
 	if (skb) {
-		ip6_multipath_l3_keys(skb, &hash_keys);
+		ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
 		return flow_hash_from_keys(&hash_keys) >> 1;
 	}
 
@@ -1847,12 +1858,17 @@ void ip6_route_input(struct sk_buff *skb)
 		.flowi6_mark = skb->mark,
 		.flowi6_proto = iph->nexthdr,
 	};
+	struct flow_keys *flkeys = NULL, _flkeys;
 
 	tun_info = skb_tunnel_info(skb);
 	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
 		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
+
+	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
+		flkeys = &_flkeys;
+
 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
-		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
+		fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys);
 	skb_dst_drop(skb);
 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
 }
-- 
cgit v1.2.3


From 6853f21f764b04e58df5e44629fec1fb8f3cbf2e Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Wed, 28 Feb 2018 23:29:29 +0200
Subject: ipmr,ipmr6: Define a uniform vif_device

The two implementations have almost identical structures - vif_device and
mif_device. As a step toward uniforming the mr_tables, eliminate the
mif_device and relocate the vif_device definition into a new common
header file.

Also, introduce a common initializing function for setting most of the
vif_device fields in a new common source file. This requires modifying
the ipv{4,6] Kconfig and ipv4 makefile as we're introducing a new common
config option - CONFIG_IP_MROUTE_COMMON.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/Kconfig     |  5 +++++
 net/ipv4/Makefile    |  1 +
 net/ipv4/ipmr.c      | 32 +++++++++++++++-----------------
 net/ipv4/ipmr_base.c | 28 ++++++++++++++++++++++++++++
 net/ipv6/Kconfig     |  1 +
 net/ipv6/ip6mr.c     | 37 +++++++++++++------------------------
 6 files changed, 63 insertions(+), 41 deletions(-)
 create mode 100644 net/ipv4/ipmr_base.c

(limited to 'net')

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index f48fe6fc7e8c..80dad301361d 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -212,9 +212,14 @@ config NET_IPGRE_BROADCAST
 	  Network), but can be distributed all over the Internet. If you want
 	  to do that, say Y here and to "IP multicast routing" below.
 
+config IP_MROUTE_COMMON
+	bool
+	depends on IP_MROUTE || IPV6_MROUTE
+
 config IP_MROUTE
 	bool "IP: multicast routing"
 	depends on IP_MULTICAST
+	select IP_MROUTE_COMMON
 	help
 	  This is used if you want your machine to act as a router for IP
 	  packets that have several destination addresses. It is needed on the
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 47a0a6649a9d..a07b7dd06def 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o
+obj-$(CONFIG_IP_MROUTE_COMMON) += ipmr_base.o
 obj-$(CONFIG_NET_IPIP) += ipip.o
 gre-y := gre_demux.o
 obj-$(CONFIG_NET_FOU) += fou.o
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 591d1fc80a1f..1370edad64bf 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -945,6 +945,10 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 	ip_rt_multicast_event(in_dev);
 
 	/* Fill in the VIF structures */
+	vif_device_init(v, dev, vifc->vifc_rate_limit,
+			vifc->vifc_threshold,
+			vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0),
+			(VIFF_TUNNEL | VIFF_REGISTER));
 
 	attr.orig_dev = dev;
 	if (!switchdev_port_attr_get(dev, &attr)) {
@@ -953,20 +957,9 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 	} else {
 		v->dev_parent_id.id_len = 0;
 	}
-	v->rate_limit = vifc->vifc_rate_limit;
+
 	v->local = vifc->vifc_lcl_addr.s_addr;
 	v->remote = vifc->vifc_rmt_addr.s_addr;
-	v->flags = vifc->vifc_flags;
-	if (!mrtsock)
-		v->flags |= VIFF_STATIC;
-	v->threshold = vifc->vifc_threshold;
-	v->bytes_in = 0;
-	v->bytes_out = 0;
-	v->pkt_in = 0;
-	v->pkt_out = 0;
-	v->link = dev->ifindex;
-	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
-		v->link = dev_get_iflink(dev);
 
 	/* And finish update writing critical data */
 	write_lock_bh(&mrt_lock);
@@ -2316,7 +2309,8 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 	}
 
 	if (VIF_EXISTS(mrt, c->mfc_parent) &&
-	    nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
+	    nla_put_u32(skb, RTA_IIF,
+			mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
 		return -EMSGSIZE;
 
 	if (c->mfc_flags & MFC_OFFLOAD)
@@ -2327,6 +2321,8 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 
 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
 		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+			struct vif_device *vif;
+
 			if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) {
 				nla_nest_cancel(skb, mp_attr);
 				return -EMSGSIZE;
@@ -2334,7 +2330,8 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 
 			nhp->rtnh_flags = 0;
 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
-			nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
+			vif = &mrt->vif_table[ct];
+			nhp->rtnh_ifindex = vif->dev->ifindex;
 			nhp->rtnh_len = sizeof(*nhp);
 		}
 	}
@@ -2954,8 +2951,8 @@ struct ipmr_vif_iter {
 };
 
 static struct vif_device *ipmr_vif_seq_idx(struct net *net,
-					   struct ipmr_vif_iter *iter,
-					   loff_t pos)
+					    struct ipmr_vif_iter *iter,
+					    loff_t pos)
 {
 	struct mr_table *mrt = iter->mrt;
 
@@ -3020,7 +3017,8 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
 			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
 	} else {
 		const struct vif_device *vif = v;
-		const char *name =  vif->dev ? vif->dev->name : "none";
+		const char *name =  vif->dev ?
+				    vif->dev->name : "none";
 
 		seq_printf(seq,
 			   "%2td %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
new file mode 100644
index 000000000000..22758f848344
--- /dev/null
+++ b/net/ipv4/ipmr_base.c
@@ -0,0 +1,28 @@
+/* Linux multicast routing support
+ * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation
+ */
+
+#include <linux/mroute_base.h>
+
+/* Sets everything common except 'dev', since that is done under locking */
+void vif_device_init(struct vif_device *v,
+		     struct net_device *dev,
+		     unsigned long rate_limit,
+		     unsigned char threshold,
+		     unsigned short flags,
+		     unsigned short get_iflink_mask)
+{
+	v->dev = NULL;
+	v->bytes_in = 0;
+	v->bytes_out = 0;
+	v->pkt_in = 0;
+	v->pkt_out = 0;
+	v->rate_limit = rate_limit;
+	v->flags = flags;
+	v->threshold = threshold;
+	if (v->flags & get_iflink_mask)
+		v->link = dev_get_iflink(dev);
+	else
+		v->link = dev->ifindex;
+}
+EXPORT_SYMBOL(vif_device_init);
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index ea71e4b0ab7a..6794ddf0547c 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -278,6 +278,7 @@ config IPV6_SUBTREES
 config IPV6_MROUTE
 	bool "IPv6: multicast routing"
 	depends on IPV6
+	select IP_MROUTE_COMMON
 	---help---
 	  Experimental support for IPv6 multicast forwarding.
 	  If unsure, say N.
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 295eb5ecaee5..e397990f6eb8 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -62,7 +62,7 @@ struct mr6_table {
 	struct timer_list	ipmr_expire_timer;
 	struct list_head	mfc6_unres_queue;
 	struct list_head	mfc6_cache_array[MFC6_LINES];
-	struct mif_device	vif6_table[MAXMIFS];
+	struct vif_device	vif6_table[MAXMIFS];
 	int			maxvif;
 	atomic_t		cache_resolve_queue_len;
 	bool			mroute_do_assert;
@@ -384,7 +384,7 @@ struct ipmr_vif_iter {
 	int ct;
 };
 
-static struct mif_device *ip6mr_vif_seq_idx(struct net *net,
+static struct vif_device *ip6mr_vif_seq_idx(struct net *net,
 					    struct ipmr_vif_iter *iter,
 					    loff_t pos)
 {
@@ -450,7 +450,7 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
 		seq_puts(seq,
 			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags\n");
 	} else {
-		const struct mif_device *vif = v;
+		const struct vif_device *vif = v;
 		const char *name = vif->dev ? vif->dev->name : "none";
 
 		seq_printf(seq,
@@ -776,7 +776,7 @@ failure:
 static int mif6_delete(struct mr6_table *mrt, int vifi, int notify,
 		       struct list_head *head)
 {
-	struct mif_device *v;
+	struct vif_device *v;
 	struct net_device *dev;
 	struct inet6_dev *in6_dev;
 
@@ -929,7 +929,7 @@ static int mif6_add(struct net *net, struct mr6_table *mrt,
 		    struct mif6ctl *vifc, int mrtsock)
 {
 	int vifi = vifc->mif6c_mifi;
-	struct mif_device *v = &mrt->vif6_table[vifi];
+	struct vif_device *v = &mrt->vif6_table[vifi];
 	struct net_device *dev;
 	struct inet6_dev *in6_dev;
 	int err;
@@ -980,21 +980,10 @@ static int mif6_add(struct net *net, struct mr6_table *mrt,
 					     dev->ifindex, &in6_dev->cnf);
 	}
 
-	/*
-	 *	Fill in the VIF structures
-	 */
-	v->rate_limit = vifc->vifc_rate_limit;
-	v->flags = vifc->mif6c_flags;
-	if (!mrtsock)
-		v->flags |= VIFF_STATIC;
-	v->threshold = vifc->vifc_threshold;
-	v->bytes_in = 0;
-	v->bytes_out = 0;
-	v->pkt_in = 0;
-	v->pkt_out = 0;
-	v->link = dev->ifindex;
-	if (v->flags & MIFF_REGISTER)
-		v->link = dev_get_iflink(dev);
+	/* Fill in the VIF structures */
+	vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold,
+			vifc->mif6c_flags | (!mrtsock ? VIFF_STATIC : 0),
+			MIFF_REGISTER);
 
 	/* And finish update writing critical data */
 	write_lock_bh(&mrt_lock);
@@ -1332,7 +1321,7 @@ static int ip6mr_device_event(struct notifier_block *this,
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct net *net = dev_net(dev);
 	struct mr6_table *mrt;
-	struct mif_device *v;
+	struct vif_device *v;
 	int ct;
 
 	if (event != NETDEV_UNREGISTER)
@@ -1873,7 +1862,7 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
 {
 	struct sioc_sg_req6 sr;
 	struct sioc_mif_req6 vr;
-	struct mif_device *vif;
+	struct vif_device *vif;
 	struct mfc6_cache *c;
 	struct net *net = sock_net(sk);
 	struct mr6_table *mrt;
@@ -1947,7 +1936,7 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
 {
 	struct compat_sioc_sg_req6 sr;
 	struct compat_sioc_mif_req6 vr;
-	struct mif_device *vif;
+	struct vif_device *vif;
 	struct mfc6_cache *c;
 	struct net *net = sock_net(sk);
 	struct mr6_table *mrt;
@@ -2018,7 +2007,7 @@ static int ip6mr_forward2(struct net *net, struct mr6_table *mrt,
 			  struct sk_buff *skb, struct mfc6_cache *c, int vifi)
 {
 	struct ipv6hdr *ipv6h;
-	struct mif_device *vif = &mrt->vif6_table[vifi];
+	struct vif_device *vif = &mrt->vif6_table[vifi];
 	struct net_device *dev;
 	struct dst_entry *dst;
 	struct flowi6 fl6;
-- 
cgit v1.2.3


From 8571ab479a6e1ef46ead5ebee567e128a422767c Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Wed, 28 Feb 2018 23:29:30 +0200
Subject: ip6mr: Make mroute_sk rcu-based

In ipmr the mr_table socket is handled under RCU. Introduce the same
for ip6mr.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c |  2 +-
 net/ipv6/ip6mr.c      | 45 +++++++++++++++++++++++++++------------------
 2 files changed, 28 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 997c7f19ad62..a6eb0e699b15 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -71,7 +71,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 
 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
-		    ((mroute6_socket(net, skb) &&
+		    ((mroute6_is_socket(net, skb) &&
 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
 					 &ipv6_hdr(skb)->saddr))) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index e397990f6eb8..a0e297ddca6e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -58,7 +58,7 @@ struct mr6_table {
 	struct list_head	list;
 	possible_net_t		net;
 	u32			id;
-	struct sock		*mroute6_sk;
+	struct sock __rcu	*mroute6_sk;
 	struct timer_list	ipmr_expire_timer;
 	struct list_head	mfc6_unres_queue;
 	struct list_head	mfc6_cache_array[MFC6_LINES];
@@ -1121,6 +1121,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt,
 static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
 			      mifi_t mifi, int assert)
 {
+	struct sock *mroute6_sk;
 	struct sk_buff *skb;
 	struct mrt6msg *msg;
 	int ret;
@@ -1190,17 +1191,19 @@ static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
 
-	if (!mrt->mroute6_sk) {
+	rcu_read_lock();
+	mroute6_sk = rcu_dereference(mrt->mroute6_sk);
+	if (!mroute6_sk) {
+		rcu_read_unlock();
 		kfree_skb(skb);
 		return -EINVAL;
 	}
 
 	mrt6msg_netlink_event(mrt, skb);
 
-	/*
-	 *	Deliver to user space multicast routing algorithms
-	 */
-	ret = sock_queue_rcv_skb(mrt->mroute6_sk, skb);
+	/* Deliver to user space multicast routing algorithms */
+	ret = sock_queue_rcv_skb(mroute6_sk, skb);
+	rcu_read_unlock();
 	if (ret < 0) {
 		net_warn_ratelimited("mroute6: pending queue full, dropping entries\n");
 		kfree_skb(skb);
@@ -1584,11 +1587,11 @@ static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk)
 
 	rtnl_lock();
 	write_lock_bh(&mrt_lock);
-	if (likely(mrt->mroute6_sk == NULL)) {
-		mrt->mroute6_sk = sk;
-		net->ipv6.devconf_all->mc_forwarding++;
-	} else {
+	if (rtnl_dereference(mrt->mroute6_sk)) {
 		err = -EADDRINUSE;
+	} else {
+		rcu_assign_pointer(mrt->mroute6_sk, sk);
+		net->ipv6.devconf_all->mc_forwarding++;
 	}
 	write_unlock_bh(&mrt_lock);
 
@@ -1614,9 +1617,9 @@ int ip6mr_sk_done(struct sock *sk)
 
 	rtnl_lock();
 	ip6mr_for_each_table(mrt, net) {
-		if (sk == mrt->mroute6_sk) {
+		if (sk == rtnl_dereference(mrt->mroute6_sk)) {
 			write_lock_bh(&mrt_lock);
-			mrt->mroute6_sk = NULL;
+			RCU_INIT_POINTER(mrt->mroute6_sk, NULL);
 			net->ipv6.devconf_all->mc_forwarding--;
 			write_unlock_bh(&mrt_lock);
 			inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
@@ -1630,11 +1633,12 @@ int ip6mr_sk_done(struct sock *sk)
 		}
 	}
 	rtnl_unlock();
+	synchronize_rcu();
 
 	return err;
 }
 
-struct sock *mroute6_socket(struct net *net, struct sk_buff *skb)
+bool mroute6_is_socket(struct net *net, struct sk_buff *skb)
 {
 	struct mr6_table *mrt;
 	struct flowi6 fl6 = {
@@ -1646,8 +1650,9 @@ struct sock *mroute6_socket(struct net *net, struct sk_buff *skb)
 	if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
 		return NULL;
 
-	return mrt->mroute6_sk;
+	return rcu_access_pointer(mrt->mroute6_sk);
 }
+EXPORT_SYMBOL(mroute6_is_socket);
 
 /*
  *	Socket options and virtual interface manipulation. The whole
@@ -1674,7 +1679,8 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 		return -ENOENT;
 
 	if (optname != MRT6_INIT) {
-		if (sk != mrt->mroute6_sk && !ns_capable(net->user_ns, CAP_NET_ADMIN))
+		if (sk != rcu_access_pointer(mrt->mroute6_sk) &&
+		    !ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EACCES;
 	}
 
@@ -1696,7 +1702,8 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 		if (vif.mif6c_mifi >= MAXMIFS)
 			return -ENFILE;
 		rtnl_lock();
-		ret = mif6_add(net, mrt, &vif, sk == mrt->mroute6_sk);
+		ret = mif6_add(net, mrt, &vif,
+			       sk == rtnl_dereference(mrt->mroute6_sk));
 		rtnl_unlock();
 		return ret;
 
@@ -1731,7 +1738,9 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 			ret = ip6mr_mfc_delete(mrt, &mfc, parent);
 		else
 			ret = ip6mr_mfc_add(net, mrt, &mfc,
-					    sk == mrt->mroute6_sk, parent);
+					    sk ==
+					    rtnl_dereference(mrt->mroute6_sk),
+					    parent);
 		rtnl_unlock();
 		return ret;
 
@@ -1783,7 +1792,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 		/* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */
 		if (v != RT_TABLE_DEFAULT && v >= 100000000)
 			return -EINVAL;
-		if (sk == mrt->mroute6_sk)
+		if (sk == rcu_access_pointer(mrt->mroute6_sk))
 			return -EBUSY;
 
 		rtnl_lock();
-- 
cgit v1.2.3


From 87c418bf1323d57140f4b448715f64de3fbb7e91 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Wed, 28 Feb 2018 23:29:31 +0200
Subject: ip6mr: Align hash implementation to ipmr

Since commit 8fb472c09b9d ("ipmr: improve hash scalability") ipmr has
been using rhashtable as a basis for its mfc routes, but ip6mr is
currently still using the old private MFC hash implementation.

Align ip6mr to the current ipmr implementation.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 313 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 168 insertions(+), 145 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index a0e297ddca6e..6f0b7f4894b2 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -61,8 +61,9 @@ struct mr6_table {
 	struct sock __rcu	*mroute6_sk;
 	struct timer_list	ipmr_expire_timer;
 	struct list_head	mfc6_unres_queue;
-	struct list_head	mfc6_cache_array[MFC6_LINES];
 	struct vif_device	vif6_table[MAXMIFS];
+	struct rhltable		mfc6_hash;
+	struct list_head	mfc6_cache_list;
 	int			maxvif;
 	atomic_t		cache_resolve_queue_len;
 	bool			mroute_do_assert;
@@ -299,10 +300,29 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
 }
 #endif
 
+static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg,
+			  const void *ptr)
+{
+	const struct mfc6_cache_cmp_arg *cmparg = arg->key;
+	struct mfc6_cache *c = (struct mfc6_cache *)ptr;
+
+	return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) ||
+	       !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin);
+}
+
+static const struct rhashtable_params ip6mr_rht_params = {
+	.head_offset = offsetof(struct mfc6_cache, mnode),
+	.key_offset = offsetof(struct mfc6_cache, cmparg),
+	.key_len = sizeof(struct mfc6_cache_cmp_arg),
+	.nelem_hint = 3,
+	.locks_mul = 1,
+	.obj_cmpfn = ip6mr_hash_cmp,
+	.automatic_shrinking = true,
+};
+
 static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
 {
 	struct mr6_table *mrt;
-	unsigned int i;
 
 	mrt = ip6mr_get_table(net, id);
 	if (mrt)
@@ -314,10 +334,8 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
 	mrt->id = id;
 	write_pnet(&mrt->net, net);
 
-	/* Forwarding cache */
-	for (i = 0; i < MFC6_LINES; i++)
-		INIT_LIST_HEAD(&mrt->mfc6_cache_array[i]);
-
+	rhltable_init(&mrt->mfc6_hash, &ip6mr_rht_params);
+	INIT_LIST_HEAD(&mrt->mfc6_cache_list);
 	INIT_LIST_HEAD(&mrt->mfc6_unres_queue);
 
 	timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0);
@@ -335,6 +353,7 @@ static void ip6mr_free_table(struct mr6_table *mrt)
 {
 	del_timer_sync(&mrt->ipmr_expire_timer);
 	mroute_clean_tables(mrt, true);
+	rhltable_destroy(&mrt->mfc6_hash);
 	kfree(mrt);
 }
 
@@ -344,7 +363,6 @@ struct ipmr_mfc_iter {
 	struct seq_net_private p;
 	struct mr6_table *mrt;
 	struct list_head *cache;
-	int ct;
 };
 
 
@@ -354,14 +372,12 @@ static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net,
 	struct mr6_table *mrt = it->mrt;
 	struct mfc6_cache *mfc;
 
-	read_lock(&mrt_lock);
-	for (it->ct = 0; it->ct < MFC6_LINES; it->ct++) {
-		it->cache = &mrt->mfc6_cache_array[it->ct];
-		list_for_each_entry(mfc, it->cache, list)
-			if (pos-- == 0)
-				return mfc;
-	}
-	read_unlock(&mrt_lock);
+	rcu_read_lock();
+	it->cache = &mrt->mfc6_cache_list;
+	list_for_each_entry_rcu(mfc, &mrt->mfc6_cache_list, list)
+		if (pos-- == 0)
+			return mfc;
+	rcu_read_unlock();
 
 	spin_lock_bh(&mfc_unres_lock);
 	it->cache = &mrt->mfc6_unres_queue;
@@ -517,19 +533,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (it->cache == &mrt->mfc6_unres_queue)
 		goto end_of_list;
 
-	BUG_ON(it->cache != &mrt->mfc6_cache_array[it->ct]);
-
-	while (++it->ct < MFC6_LINES) {
-		it->cache = &mrt->mfc6_cache_array[it->ct];
-		if (list_empty(it->cache))
-			continue;
-		return list_first_entry(it->cache, struct mfc6_cache, list);
-	}
-
 	/* exhausted cache_array, show unresolved */
-	read_unlock(&mrt_lock);
+	rcu_read_unlock();
 	it->cache = &mrt->mfc6_unres_queue;
-	it->ct = 0;
 
 	spin_lock_bh(&mfc_unres_lock);
 	if (!list_empty(it->cache))
@@ -549,8 +555,8 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 
 	if (it->cache == &mrt->mfc6_unres_queue)
 		spin_unlock_bh(&mfc_unres_lock);
-	else if (it->cache == &mrt->mfc6_cache_array[it->ct])
-		read_unlock(&mrt_lock);
+	else if (it->cache == &mrt->mfc6_cache_list)
+		rcu_read_unlock();
 }
 
 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
@@ -827,11 +833,18 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, int notify,
 	return 0;
 }
 
-static inline void ip6mr_cache_free(struct mfc6_cache *c)
+static inline void ip6mr_cache_free_rcu(struct rcu_head *head)
 {
+	struct mfc6_cache *c = container_of(head, struct mfc6_cache, rcu);
+
 	kmem_cache_free(mrt_cachep, c);
 }
 
+static inline void ip6mr_cache_free(struct mfc6_cache *c)
+{
+	call_rcu(&c->rcu, ip6mr_cache_free_rcu);
+}
+
 /* Destroy an unresolved cache entry, killing queued skbs
    and reporting error to netlink readers.
  */
@@ -1002,14 +1015,17 @@ static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt,
 					   const struct in6_addr *origin,
 					   const struct in6_addr *mcastgrp)
 {
-	int line = MFC6_HASH(mcastgrp, origin);
+	struct mfc6_cache_cmp_arg arg = {
+		.mf6c_origin = *origin,
+		.mf6c_mcastgrp = *mcastgrp,
+	};
+	struct rhlist_head *tmp, *list;
 	struct mfc6_cache *c;
 
-	list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) {
-		if (ipv6_addr_equal(&c->mf6c_origin, origin) &&
-		    ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp))
-			return c;
-	}
+	list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		return c;
+
 	return NULL;
 }
 
@@ -1017,13 +1033,16 @@ static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt,
 static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr6_table *mrt,
 						      mifi_t mifi)
 {
-	int line = MFC6_HASH(&in6addr_any, &in6addr_any);
+	struct mfc6_cache_cmp_arg arg = {
+		.mf6c_origin = in6addr_any,
+		.mf6c_mcastgrp = in6addr_any,
+	};
+	struct rhlist_head *tmp, *list;
 	struct mfc6_cache *c;
 
-	list_for_each_entry(c, &mrt->mfc6_cache_array[line], list)
-		if (ipv6_addr_any(&c->mf6c_origin) &&
-		    ipv6_addr_any(&c->mf6c_mcastgrp) &&
-		    (c->mfc_un.res.ttls[mifi] < 255))
+	list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		if (c->mfc_un.res.ttls[mifi] < 255)
 			return c;
 
 	return NULL;
@@ -1034,29 +1053,53 @@ static struct mfc6_cache *ip6mr_cache_find_any(struct mr6_table *mrt,
 					       struct in6_addr *mcastgrp,
 					       mifi_t mifi)
 {
-	int line = MFC6_HASH(mcastgrp, &in6addr_any);
+	struct mfc6_cache_cmp_arg arg = {
+		.mf6c_origin = in6addr_any,
+		.mf6c_mcastgrp = *mcastgrp,
+	};
+	struct rhlist_head *tmp, *list;
 	struct mfc6_cache *c, *proxy;
 
 	if (ipv6_addr_any(mcastgrp))
 		goto skip;
 
-	list_for_each_entry(c, &mrt->mfc6_cache_array[line], list)
-		if (ipv6_addr_any(&c->mf6c_origin) &&
-		    ipv6_addr_equal(&c->mf6c_mcastgrp, mcastgrp)) {
-			if (c->mfc_un.res.ttls[mifi] < 255)
-				return c;
-
-			/* It's ok if the mifi is part of the static tree */
-			proxy = ip6mr_cache_find_any_parent(mrt,
-							    c->mf6c_parent);
-			if (proxy && proxy->mfc_un.res.ttls[mifi] < 255)
-				return c;
-		}
+	list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode) {
+		if (c->mfc_un.res.ttls[mifi] < 255)
+			return c;
+
+		/* It's ok if the mifi is part of the static tree */
+		proxy = ip6mr_cache_find_any_parent(mrt, c->mf6c_parent);
+		if (proxy && proxy->mfc_un.res.ttls[mifi] < 255)
+			return c;
+	}
 
 skip:
 	return ip6mr_cache_find_any_parent(mrt, mifi);
 }
 
+/* Look for a (S,G,iif) entry if parent != -1 */
+static struct mfc6_cache *
+ip6mr_cache_find_parent(struct mr6_table *mrt,
+			const struct in6_addr *origin,
+			const struct in6_addr *mcastgrp,
+			int parent)
+{
+	struct mfc6_cache_cmp_arg arg = {
+		.mf6c_origin = *origin,
+		.mf6c_mcastgrp = *mcastgrp,
+	};
+	struct rhlist_head *tmp, *list;
+	struct mfc6_cache *c;
+
+	list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		if (parent == -1 || parent == c->mf6c_parent)
+			return c;
+
+	return NULL;
+}
+
 /*
  *	Allocate a multicast cache entry
  */
@@ -1296,26 +1339,21 @@ ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb)
 static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc,
 			    int parent)
 {
-	int line;
-	struct mfc6_cache *c, *next;
-
-	line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr);
+	struct mfc6_cache *c;
 
-	list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[line], list) {
-		if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) &&
-		    ipv6_addr_equal(&c->mf6c_mcastgrp,
-				    &mfc->mf6cc_mcastgrp.sin6_addr) &&
-		    (parent == -1 || parent == c->mf6c_parent)) {
-			write_lock_bh(&mrt_lock);
-			list_del(&c->list);
-			write_unlock_bh(&mrt_lock);
+	/* The entries are added/deleted only under RTNL */
+	rcu_read_lock();
+	c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr,
+				    &mfc->mf6cc_mcastgrp.sin6_addr, parent);
+	rcu_read_unlock();
+	if (!c)
+		return -ENOENT;
+	rhltable_remove(&mrt->mfc6_hash, &c->mnode, ip6mr_rht_params);
+	list_del_rcu(&c->list);
 
-			mr6_netlink_event(mrt, c, RTM_DELROUTE);
-			ip6mr_cache_free(c);
-			return 0;
-		}
-	}
-	return -ENOENT;
+	mr6_netlink_event(mrt, c, RTM_DELROUTE);
+	ip6mr_cache_free(c);
+	return 0;
 }
 
 static int ip6mr_device_event(struct notifier_block *this,
@@ -1448,11 +1486,10 @@ void ip6_mr_cleanup(void)
 static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt,
 			 struct mf6cctl *mfc, int mrtsock, int parent)
 {
-	bool found = false;
-	int line;
-	struct mfc6_cache *uc, *c;
 	unsigned char ttls[MAXMIFS];
-	int i;
+	struct mfc6_cache *uc, *c;
+	bool found;
+	int i, err;
 
 	if (mfc->mf6cc_parent >= MAXMIFS)
 		return -ENFILE;
@@ -1461,22 +1498,14 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt,
 	for (i = 0; i < MAXMIFS; i++) {
 		if (IF_ISSET(i, &mfc->mf6cc_ifset))
 			ttls[i] = 1;
-
-	}
-
-	line = MFC6_HASH(&mfc->mf6cc_mcastgrp.sin6_addr, &mfc->mf6cc_origin.sin6_addr);
-
-	list_for_each_entry(c, &mrt->mfc6_cache_array[line], list) {
-		if (ipv6_addr_equal(&c->mf6c_origin, &mfc->mf6cc_origin.sin6_addr) &&
-		    ipv6_addr_equal(&c->mf6c_mcastgrp,
-				    &mfc->mf6cc_mcastgrp.sin6_addr) &&
-		    (parent == -1 || parent == mfc->mf6cc_parent)) {
-			found = true;
-			break;
-		}
 	}
 
-	if (found) {
+	/* The entries are added/deleted only under RTNL */
+	rcu_read_lock();
+	c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr,
+				    &mfc->mf6cc_mcastgrp.sin6_addr, parent);
+	rcu_read_unlock();
+	if (c) {
 		write_lock_bh(&mrt_lock);
 		c->mf6c_parent = mfc->mf6cc_parent;
 		ip6mr_update_thresholds(mrt, c, ttls);
@@ -1502,13 +1531,17 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt,
 	if (!mrtsock)
 		c->mfc_flags |= MFC_STATIC;
 
-	write_lock_bh(&mrt_lock);
-	list_add(&c->list, &mrt->mfc6_cache_array[line]);
-	write_unlock_bh(&mrt_lock);
+	err = rhltable_insert_key(&mrt->mfc6_hash, &c->cmparg, &c->mnode,
+				  ip6mr_rht_params);
+	if (err) {
+		pr_err("ip6mr: rhtable insert error %d\n", err);
+		ip6mr_cache_free(c);
+		return err;
+	}
+	list_add_tail_rcu(&c->list, &mrt->mfc6_cache_list);
 
-	/*
-	 *	Check to see if we resolved a queued list. If so we
-	 *	need to send on the frames and tidy up.
+	/* Check to see if we resolved a queued list. If so we
+	 * need to send on the frames and tidy up.
 	 */
 	found = false;
 	spin_lock_bh(&mfc_unres_lock);
@@ -1539,13 +1572,11 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt,
 
 static void mroute_clean_tables(struct mr6_table *mrt, bool all)
 {
-	int i;
+	struct mfc6_cache *c, *tmp;
 	LIST_HEAD(list);
-	struct mfc6_cache *c, *next;
+	int i;
 
-	/*
-	 *	Shut down all active vif entries
-	 */
+	/* Shut down all active vif entries */
 	for (i = 0; i < mrt->maxvif; i++) {
 		if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC))
 			continue;
@@ -1553,25 +1584,19 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all)
 	}
 	unregister_netdevice_many(&list);
 
-	/*
-	 *	Wipe the cache
-	 */
-	for (i = 0; i < MFC6_LINES; i++) {
-		list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) {
-			if (!all && (c->mfc_flags & MFC_STATIC))
-				continue;
-			write_lock_bh(&mrt_lock);
-			list_del(&c->list);
-			write_unlock_bh(&mrt_lock);
-
-			mr6_netlink_event(mrt, c, RTM_DELROUTE);
-			ip6mr_cache_free(c);
-		}
+	/* Wipe the cache */
+	list_for_each_entry_safe(c, tmp, &mrt->mfc6_cache_list, list) {
+		if (!all && (c->mfc_flags & MFC_STATIC))
+			continue;
+		rhltable_remove(&mrt->mfc6_hash, &c->mnode, ip6mr_rht_params);
+		list_del_rcu(&c->list);
+		mr6_netlink_event(mrt, c, RTM_DELROUTE);
+		ip6mr_cache_free(c);
 	}
 
 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
 		spin_lock_bh(&mfc_unres_lock);
-		list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) {
+		list_for_each_entry_safe(c, tmp, &mrt->mfc6_unres_queue, list) {
 			list_del(&c->list);
 			mr6_netlink_event(mrt, c, RTM_DELROUTE);
 			ip6mr_destroy_unres(mrt, c);
@@ -1905,19 +1930,19 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
 		if (copy_from_user(&sr, arg, sizeof(sr)))
 			return -EFAULT;
 
-		read_lock(&mrt_lock);
+		rcu_read_lock();
 		c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
 		if (c) {
 			sr.pktcnt = c->mfc_un.res.pkt;
 			sr.bytecnt = c->mfc_un.res.bytes;
 			sr.wrong_if = c->mfc_un.res.wrong_if;
-			read_unlock(&mrt_lock);
+			rcu_read_unlock();
 
 			if (copy_to_user(arg, &sr, sizeof(sr)))
 				return -EFAULT;
 			return 0;
 		}
-		read_unlock(&mrt_lock);
+		rcu_read_unlock();
 		return -EADDRNOTAVAIL;
 	default:
 		return -ENOIOCTLCMD;
@@ -1979,19 +2004,19 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
 		if (copy_from_user(&sr, arg, sizeof(sr)))
 			return -EFAULT;
 
-		read_lock(&mrt_lock);
+		rcu_read_lock();
 		c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
 		if (c) {
 			sr.pktcnt = c->mfc_un.res.pkt;
 			sr.bytecnt = c->mfc_un.res.bytes;
 			sr.wrong_if = c->mfc_un.res.wrong_if;
-			read_unlock(&mrt_lock);
+			rcu_read_unlock();
 
 			if (copy_to_user(arg, &sr, sizeof(sr)))
 				return -EFAULT;
 			return 0;
 		}
-		read_unlock(&mrt_lock);
+		rcu_read_unlock();
 		return -EADDRNOTAVAIL;
 	default:
 		return -ENOIOCTLCMD;
@@ -2115,10 +2140,14 @@ static void ip6_mr_forward(struct net *net, struct mr6_table *mrt,
 		/* For an (*,G) entry, we only check that the incoming
 		 * interface is part of the static tree.
 		 */
+		rcu_read_lock();
 		cache_proxy = ip6mr_cache_find_any_parent(mrt, vif);
 		if (cache_proxy &&
-		    cache_proxy->mfc_un.res.ttls[true_vifi] < 255)
+		    cache_proxy->mfc_un.res.ttls[true_vifi] < 255) {
+			rcu_read_unlock();
 			goto forward;
+		}
+		rcu_read_unlock();
 	}
 
 	/*
@@ -2535,34 +2564,30 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 	struct mr6_table *mrt;
 	struct mfc6_cache *mfc;
 	unsigned int t = 0, s_t;
-	unsigned int h = 0, s_h;
 	unsigned int e = 0, s_e;
 
 	s_t = cb->args[0];
-	s_h = cb->args[1];
-	s_e = cb->args[2];
+	s_e = cb->args[1];
 
-	read_lock(&mrt_lock);
+	rcu_read_lock();
 	ip6mr_for_each_table(mrt, net) {
 		if (t < s_t)
 			goto next_table;
-		if (t > s_t)
-			s_h = 0;
-		for (h = s_h; h < MFC6_LINES; h++) {
-			list_for_each_entry(mfc, &mrt->mfc6_cache_array[h], list) {
-				if (e < s_e)
-					goto next_entry;
-				if (ip6mr_fill_mroute(mrt, skb,
-						      NETLINK_CB(cb->skb).portid,
-						      cb->nlh->nlmsg_seq,
-						      mfc, RTM_NEWROUTE,
-						      NLM_F_MULTI) < 0)
-					goto done;
+		list_for_each_entry_rcu(mfc, &mrt->mfc6_cache_list, list) {
+			if (e < s_e)
+				goto next_entry;
+			if (ip6mr_fill_mroute(mrt, skb,
+					      NETLINK_CB(cb->skb).portid,
+					      cb->nlh->nlmsg_seq,
+					      mfc, RTM_NEWROUTE,
+					      NLM_F_MULTI) < 0)
+				goto done;
 next_entry:
-				e++;
-			}
-			e = s_e = 0;
+			e++;
 		}
+		e = 0;
+		s_e = 0;
+
 		spin_lock_bh(&mfc_unres_lock);
 		list_for_each_entry(mfc, &mrt->mfc6_unres_queue, list) {
 			if (e < s_e)
@@ -2580,15 +2605,13 @@ next_entry2:
 		}
 		spin_unlock_bh(&mfc_unres_lock);
 		e = s_e = 0;
-		s_h = 0;
 next_table:
 		t++;
 	}
 done:
-	read_unlock(&mrt_lock);
+	rcu_read_unlock();
 
-	cb->args[2] = e;
-	cb->args[1] = h;
+	cb->args[1] = e;
 	cb->args[0] = t;
 
 	return skb->len;
-- 
cgit v1.2.3


From b70432f7319eb75b24ca57dde8146c5e27244780 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Wed, 28 Feb 2018 23:29:32 +0200
Subject: mroute*: Make mr_table a common struct

Following previous changes to ip6mr, mr_table and mr6_table are
basically the same [up to mr6_table having additional '6' suffixes to
its variable names].
Move the common structure definition into a common header; This
requires renaming all references in ip6mr to variables that had the
distinct suffix.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c  |   2 -
 net/ipv6/ip6mr.c | 301 +++++++++++++++++++++++++------------------------------
 2 files changed, 139 insertions(+), 164 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 1370edad64bf..a1bf0020cad1 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -31,7 +31,6 @@
 #include <linux/cache.h>
 #include <linux/capability.h>
 #include <linux/errno.h>
-#include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/kernel.h>
 #include <linux/fcntl.h>
@@ -53,7 +52,6 @@
 #include <net/protocol.h>
 #include <linux/skbuff.h>
 #include <net/route.h>
-#include <net/sock.h>
 #include <net/icmp.h>
 #include <net/udp.h>
 #include <net/raw.h>
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 6f0b7f4894b2..adbb826ca8fd 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -20,7 +20,6 @@
 #include <linux/types.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
-#include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/kernel.h>
 #include <linux/fcntl.h>
@@ -36,7 +35,6 @@
 #include <linux/compat.h>
 #include <net/protocol.h>
 #include <linux/skbuff.h>
-#include <net/sock.h>
 #include <net/raw.h>
 #include <linux/notifier.h>
 #include <linux/if_arp.h>
@@ -54,31 +52,12 @@
 #include <net/ip6_checksum.h>
 #include <linux/netconf.h>
 
-struct mr6_table {
-	struct list_head	list;
-	possible_net_t		net;
-	u32			id;
-	struct sock __rcu	*mroute6_sk;
-	struct timer_list	ipmr_expire_timer;
-	struct list_head	mfc6_unres_queue;
-	struct vif_device	vif6_table[MAXMIFS];
-	struct rhltable		mfc6_hash;
-	struct list_head	mfc6_cache_list;
-	int			maxvif;
-	atomic_t		cache_resolve_queue_len;
-	bool			mroute_do_assert;
-	bool			mroute_do_pim;
-#ifdef CONFIG_IPV6_PIMSM_V2
-	int			mroute_reg_vif_num;
-#endif
-};
-
 struct ip6mr_rule {
 	struct fib_rule		common;
 };
 
 struct ip6mr_result {
-	struct mr6_table	*mrt;
+	struct mr_table	*mrt;
 };
 
 /* Big lock, protecting vif table, mrt cache and mroute socket state.
@@ -87,11 +66,7 @@ struct ip6mr_result {
 
 static DEFINE_RWLOCK(mrt_lock);
 
-/*
- *	Multicast router control variables
- */
-
-#define MIF_EXISTS(_mrt, _idx) ((_mrt)->vif6_table[_idx].dev != NULL)
+/* Multicast router control variables */
 
 /* Special spinlock for queue of unresolved entries */
 static DEFINE_SPINLOCK(mfc_unres_lock);
@@ -106,30 +81,30 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
 
 static struct kmem_cache *mrt_cachep __read_mostly;
 
-static struct mr6_table *ip6mr_new_table(struct net *net, u32 id);
-static void ip6mr_free_table(struct mr6_table *mrt);
+static struct mr_table *ip6mr_new_table(struct net *net, u32 id);
+static void ip6mr_free_table(struct mr_table *mrt);
 
-static void ip6_mr_forward(struct net *net, struct mr6_table *mrt,
+static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
 			   struct sk_buff *skb, struct mfc6_cache *cache);
-static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
+static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
 			      mifi_t mifi, int assert);
-static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
+static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 			       struct mfc6_cache *c, struct rtmsg *rtm);
-static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc,
+static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
 			      int cmd);
-static void mrt6msg_netlink_event(struct mr6_table *mrt, struct sk_buff *pkt);
+static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
 static int ip6mr_rtm_dumproute(struct sk_buff *skb,
 			       struct netlink_callback *cb);
-static void mroute_clean_tables(struct mr6_table *mrt, bool all);
+static void mroute_clean_tables(struct mr_table *mrt, bool all);
 static void ipmr_expire_process(struct timer_list *t);
 
 #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
 #define ip6mr_for_each_table(mrt, net) \
 	list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list)
 
-static struct mr6_table *ip6mr_get_table(struct net *net, u32 id)
+static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
 {
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 
 	ip6mr_for_each_table(mrt, net) {
 		if (mrt->id == id)
@@ -139,7 +114,7 @@ static struct mr6_table *ip6mr_get_table(struct net *net, u32 id)
 }
 
 static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
-			    struct mr6_table **mrt)
+			    struct mr_table **mrt)
 {
 	int err;
 	struct ip6mr_result res;
@@ -160,7 +135,7 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp,
 			     int flags, struct fib_lookup_arg *arg)
 {
 	struct ip6mr_result *res = arg->result;
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 
 	switch (rule->action) {
 	case FR_ACT_TO_TBL:
@@ -228,7 +203,7 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
 static int __net_init ip6mr_rules_init(struct net *net)
 {
 	struct fib_rules_ops *ops;
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 	int err;
 
 	ops = fib_rules_register(&ip6mr_rules_ops_template, net);
@@ -259,7 +234,7 @@ err1:
 
 static void __net_exit ip6mr_rules_exit(struct net *net)
 {
-	struct mr6_table *mrt, *next;
+	struct mr_table *mrt, *next;
 
 	rtnl_lock();
 	list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
@@ -273,13 +248,13 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
 #define ip6mr_for_each_table(mrt, net) \
 	for (mrt = net->ipv6.mrt6; mrt; mrt = NULL)
 
-static struct mr6_table *ip6mr_get_table(struct net *net, u32 id)
+static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
 {
 	return net->ipv6.mrt6;
 }
 
 static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
-			    struct mr6_table **mrt)
+			    struct mr_table **mrt)
 {
 	*mrt = net->ipv6.mrt6;
 	return 0;
@@ -320,9 +295,9 @@ static const struct rhashtable_params ip6mr_rht_params = {
 	.automatic_shrinking = true,
 };
 
-static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
+static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
 {
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 
 	mrt = ip6mr_get_table(net, id);
 	if (mrt)
@@ -334,9 +309,9 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
 	mrt->id = id;
 	write_pnet(&mrt->net, net);
 
-	rhltable_init(&mrt->mfc6_hash, &ip6mr_rht_params);
-	INIT_LIST_HEAD(&mrt->mfc6_cache_list);
-	INIT_LIST_HEAD(&mrt->mfc6_unres_queue);
+	rhltable_init(&mrt->mfc_hash, &ip6mr_rht_params);
+	INIT_LIST_HEAD(&mrt->mfc_cache_list);
+	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
 
 	timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0);
 
@@ -349,11 +324,11 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
 	return mrt;
 }
 
-static void ip6mr_free_table(struct mr6_table *mrt)
+static void ip6mr_free_table(struct mr_table *mrt)
 {
 	del_timer_sync(&mrt->ipmr_expire_timer);
 	mroute_clean_tables(mrt, true);
-	rhltable_destroy(&mrt->mfc6_hash);
+	rhltable_destroy(&mrt->mfc_hash);
 	kfree(mrt);
 }
 
@@ -361,7 +336,7 @@ static void ip6mr_free_table(struct mr6_table *mrt)
 
 struct ipmr_mfc_iter {
 	struct seq_net_private p;
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 	struct list_head *cache;
 };
 
@@ -369,18 +344,18 @@ struct ipmr_mfc_iter {
 static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net,
 					   struct ipmr_mfc_iter *it, loff_t pos)
 {
-	struct mr6_table *mrt = it->mrt;
+	struct mr_table *mrt = it->mrt;
 	struct mfc6_cache *mfc;
 
 	rcu_read_lock();
-	it->cache = &mrt->mfc6_cache_list;
-	list_for_each_entry_rcu(mfc, &mrt->mfc6_cache_list, list)
+	it->cache = &mrt->mfc_cache_list;
+	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
 		if (pos-- == 0)
 			return mfc;
 	rcu_read_unlock();
 
 	spin_lock_bh(&mfc_unres_lock);
-	it->cache = &mrt->mfc6_unres_queue;
+	it->cache = &mrt->mfc_unres_queue;
 	list_for_each_entry(mfc, it->cache, list)
 		if (pos-- == 0)
 			return mfc;
@@ -396,7 +371,7 @@ static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net,
 
 struct ipmr_vif_iter {
 	struct seq_net_private p;
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 	int ct;
 };
 
@@ -404,13 +379,13 @@ static struct vif_device *ip6mr_vif_seq_idx(struct net *net,
 					    struct ipmr_vif_iter *iter,
 					    loff_t pos)
 {
-	struct mr6_table *mrt = iter->mrt;
+	struct mr_table *mrt = iter->mrt;
 
 	for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
-		if (!MIF_EXISTS(mrt, iter->ct))
+		if (!VIF_EXISTS(mrt, iter->ct))
 			continue;
 		if (pos-- == 0)
-			return &mrt->vif6_table[iter->ct];
+			return &mrt->vif_table[iter->ct];
 	}
 	return NULL;
 }
@@ -420,7 +395,7 @@ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct ipmr_vif_iter *iter = seq->private;
 	struct net *net = seq_file_net(seq);
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 
 	mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
 	if (!mrt)
@@ -437,16 +412,16 @@ static void *ip6mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct ipmr_vif_iter *iter = seq->private;
 	struct net *net = seq_file_net(seq);
-	struct mr6_table *mrt = iter->mrt;
+	struct mr_table *mrt = iter->mrt;
 
 	++*pos;
 	if (v == SEQ_START_TOKEN)
 		return ip6mr_vif_seq_idx(net, iter, 0);
 
 	while (++iter->ct < mrt->maxvif) {
-		if (!MIF_EXISTS(mrt, iter->ct))
+		if (!VIF_EXISTS(mrt, iter->ct))
 			continue;
-		return &mrt->vif6_table[iter->ct];
+		return &mrt->vif_table[iter->ct];
 	}
 	return NULL;
 }
@@ -460,7 +435,7 @@ static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
 static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
 {
 	struct ipmr_vif_iter *iter = seq->private;
-	struct mr6_table *mrt = iter->mrt;
+	struct mr_table *mrt = iter->mrt;
 
 	if (v == SEQ_START_TOKEN) {
 		seq_puts(seq,
@@ -471,7 +446,7 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
 
 		seq_printf(seq,
 			   "%2td %-10s %8ld %7ld  %8ld %7ld %05X\n",
-			   vif - mrt->vif6_table,
+			   vif - mrt->vif_table,
 			   name, vif->bytes_in, vif->pkt_in,
 			   vif->bytes_out, vif->pkt_out,
 			   vif->flags);
@@ -503,7 +478,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 {
 	struct ipmr_mfc_iter *it = seq->private;
 	struct net *net = seq_file_net(seq);
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 
 	mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
 	if (!mrt)
@@ -520,7 +495,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	struct mfc6_cache *mfc = v;
 	struct ipmr_mfc_iter *it = seq->private;
 	struct net *net = seq_file_net(seq);
-	struct mr6_table *mrt = it->mrt;
+	struct mr_table *mrt = it->mrt;
 
 	++*pos;
 
@@ -530,12 +505,12 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (mfc->list.next != it->cache)
 		return list_entry(mfc->list.next, struct mfc6_cache, list);
 
-	if (it->cache == &mrt->mfc6_unres_queue)
+	if (it->cache == &mrt->mfc_unres_queue)
 		goto end_of_list;
 
 	/* exhausted cache_array, show unresolved */
 	rcu_read_unlock();
-	it->cache = &mrt->mfc6_unres_queue;
+	it->cache = &mrt->mfc_unres_queue;
 
 	spin_lock_bh(&mfc_unres_lock);
 	if (!list_empty(it->cache))
@@ -551,11 +526,11 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 {
 	struct ipmr_mfc_iter *it = seq->private;
-	struct mr6_table *mrt = it->mrt;
+	struct mr_table *mrt = it->mrt;
 
-	if (it->cache == &mrt->mfc6_unres_queue)
+	if (it->cache == &mrt->mfc_unres_queue)
 		spin_unlock_bh(&mfc_unres_lock);
-	else if (it->cache == &mrt->mfc6_cache_list)
+	else if (it->cache == &mrt->mfc_cache_list)
 		rcu_read_unlock();
 }
 
@@ -571,20 +546,20 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 	} else {
 		const struct mfc6_cache *mfc = v;
 		const struct ipmr_mfc_iter *it = seq->private;
-		struct mr6_table *mrt = it->mrt;
+		struct mr_table *mrt = it->mrt;
 
 		seq_printf(seq, "%pI6 %pI6 %-3hd",
 			   &mfc->mf6c_mcastgrp, &mfc->mf6c_origin,
 			   mfc->mf6c_parent);
 
-		if (it->cache != &mrt->mfc6_unres_queue) {
+		if (it->cache != &mrt->mfc_unres_queue) {
 			seq_printf(seq, " %8lu %8lu %8lu",
 				   mfc->mfc_un.res.pkt,
 				   mfc->mfc_un.res.bytes,
 				   mfc->mfc_un.res.wrong_if);
 			for (n = mfc->mfc_un.res.minvif;
 			     n < mfc->mfc_un.res.maxvif; n++) {
-				if (MIF_EXISTS(mrt, n) &&
+				if (VIF_EXISTS(mrt, n) &&
 				    mfc->mfc_un.res.ttls[n] < 255)
 					seq_printf(seq,
 						   " %2d:%-3d",
@@ -630,7 +605,7 @@ static int pim6_rcv(struct sk_buff *skb)
 	struct ipv6hdr   *encap;
 	struct net_device  *reg_dev = NULL;
 	struct net *net = dev_net(skb->dev);
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 	struct flowi6 fl6 = {
 		.flowi6_iif	= skb->dev->ifindex,
 		.flowi6_mark	= skb->mark,
@@ -664,7 +639,7 @@ static int pim6_rcv(struct sk_buff *skb)
 
 	read_lock(&mrt_lock);
 	if (reg_vif_num >= 0)
-		reg_dev = mrt->vif6_table[reg_vif_num].dev;
+		reg_dev = mrt->vif_table[reg_vif_num].dev;
 	if (reg_dev)
 		dev_hold(reg_dev);
 	read_unlock(&mrt_lock);
@@ -699,7 +674,7 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
 				      struct net_device *dev)
 {
 	struct net *net = dev_net(dev);
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 	struct flowi6 fl6 = {
 		.flowi6_oif	= dev->ifindex,
 		.flowi6_iif	= skb->skb_iif ? : LOOPBACK_IFINDEX,
@@ -742,7 +717,7 @@ static void reg_vif_setup(struct net_device *dev)
 	dev->features		|= NETIF_F_NETNS_LOCAL;
 }
 
-static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt)
+static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt)
 {
 	struct net_device *dev;
 	char name[IFNAMSIZ];
@@ -779,7 +754,7 @@ failure:
  *	Delete a VIF entry
  */
 
-static int mif6_delete(struct mr6_table *mrt, int vifi, int notify,
+static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
 		       struct list_head *head)
 {
 	struct vif_device *v;
@@ -789,7 +764,7 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, int notify,
 	if (vifi < 0 || vifi >= mrt->maxvif)
 		return -EADDRNOTAVAIL;
 
-	v = &mrt->vif6_table[vifi];
+	v = &mrt->vif_table[vifi];
 
 	write_lock_bh(&mrt_lock);
 	dev = v->dev;
@@ -808,7 +783,7 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, int notify,
 	if (vifi + 1 == mrt->maxvif) {
 		int tmp;
 		for (tmp = vifi - 1; tmp >= 0; tmp--) {
-			if (MIF_EXISTS(mrt, tmp))
+			if (VIF_EXISTS(mrt, tmp))
 				break;
 		}
 		mrt->maxvif = tmp + 1;
@@ -849,7 +824,7 @@ static inline void ip6mr_cache_free(struct mfc6_cache *c)
    and reporting error to netlink readers.
  */
 
-static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c)
+static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c)
 {
 	struct net *net = read_pnet(&mrt->net);
 	struct sk_buff *skb;
@@ -875,13 +850,13 @@ static void ip6mr_destroy_unres(struct mr6_table *mrt, struct mfc6_cache *c)
 
 /* Timer process for all the unresolved queue. */
 
-static void ipmr_do_expire_process(struct mr6_table *mrt)
+static void ipmr_do_expire_process(struct mr_table *mrt)
 {
 	unsigned long now = jiffies;
 	unsigned long expires = 10 * HZ;
 	struct mfc6_cache *c, *next;
 
-	list_for_each_entry_safe(c, next, &mrt->mfc6_unres_queue, list) {
+	list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
 		if (time_after(c->mfc_un.unres.expires, now)) {
 			/* not yet... */
 			unsigned long interval = c->mfc_un.unres.expires - now;
@@ -895,20 +870,20 @@ static void ipmr_do_expire_process(struct mr6_table *mrt)
 		ip6mr_destroy_unres(mrt, c);
 	}
 
-	if (!list_empty(&mrt->mfc6_unres_queue))
+	if (!list_empty(&mrt->mfc_unres_queue))
 		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
 }
 
 static void ipmr_expire_process(struct timer_list *t)
 {
-	struct mr6_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
+	struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
 
 	if (!spin_trylock(&mfc_unres_lock)) {
 		mod_timer(&mrt->ipmr_expire_timer, jiffies + 1);
 		return;
 	}
 
-	if (!list_empty(&mrt->mfc6_unres_queue))
+	if (!list_empty(&mrt->mfc_unres_queue))
 		ipmr_do_expire_process(mrt);
 
 	spin_unlock(&mfc_unres_lock);
@@ -916,7 +891,8 @@ static void ipmr_expire_process(struct timer_list *t)
 
 /* Fill oifs list. It is called under write locked mrt_lock. */
 
-static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *cache,
+static void ip6mr_update_thresholds(struct mr_table *mrt,
+				    struct mfc6_cache *cache,
 				    unsigned char *ttls)
 {
 	int vifi;
@@ -926,7 +902,7 @@ static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *ca
 	memset(cache->mfc_un.res.ttls, 255, MAXMIFS);
 
 	for (vifi = 0; vifi < mrt->maxvif; vifi++) {
-		if (MIF_EXISTS(mrt, vifi) &&
+		if (VIF_EXISTS(mrt, vifi) &&
 		    ttls[vifi] && ttls[vifi] < 255) {
 			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
 			if (cache->mfc_un.res.minvif > vifi)
@@ -938,17 +914,17 @@ static void ip6mr_update_thresholds(struct mr6_table *mrt, struct mfc6_cache *ca
 	cache->mfc_un.res.lastuse = jiffies;
 }
 
-static int mif6_add(struct net *net, struct mr6_table *mrt,
+static int mif6_add(struct net *net, struct mr_table *mrt,
 		    struct mif6ctl *vifc, int mrtsock)
 {
 	int vifi = vifc->mif6c_mifi;
-	struct vif_device *v = &mrt->vif6_table[vifi];
+	struct vif_device *v = &mrt->vif_table[vifi];
 	struct net_device *dev;
 	struct inet6_dev *in6_dev;
 	int err;
 
 	/* Is vif busy ? */
-	if (MIF_EXISTS(mrt, vifi))
+	if (VIF_EXISTS(mrt, vifi))
 		return -EADDRINUSE;
 
 	switch (vifc->mif6c_flags) {
@@ -1011,7 +987,7 @@ static int mif6_add(struct net *net, struct mr6_table *mrt,
 	return 0;
 }
 
-static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt,
+static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt,
 					   const struct in6_addr *origin,
 					   const struct in6_addr *mcastgrp)
 {
@@ -1022,7 +998,7 @@ static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt,
 	struct rhlist_head *tmp, *list;
 	struct mfc6_cache *c;
 
-	list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params);
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params);
 	rhl_for_each_entry_rcu(c, tmp, list, mnode)
 		return c;
 
@@ -1030,7 +1006,7 @@ static struct mfc6_cache *ip6mr_cache_find(struct mr6_table *mrt,
 }
 
 /* Look for a (*,*,oif) entry */
-static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr6_table *mrt,
+static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr_table *mrt,
 						      mifi_t mifi)
 {
 	struct mfc6_cache_cmp_arg arg = {
@@ -1040,7 +1016,7 @@ static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr6_table *mrt,
 	struct rhlist_head *tmp, *list;
 	struct mfc6_cache *c;
 
-	list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params);
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params);
 	rhl_for_each_entry_rcu(c, tmp, list, mnode)
 		if (c->mfc_un.res.ttls[mifi] < 255)
 			return c;
@@ -1049,7 +1025,7 @@ static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr6_table *mrt,
 }
 
 /* Look for a (*,G) entry */
-static struct mfc6_cache *ip6mr_cache_find_any(struct mr6_table *mrt,
+static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt,
 					       struct in6_addr *mcastgrp,
 					       mifi_t mifi)
 {
@@ -1063,7 +1039,7 @@ static struct mfc6_cache *ip6mr_cache_find_any(struct mr6_table *mrt,
 	if (ipv6_addr_any(mcastgrp))
 		goto skip;
 
-	list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params);
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params);
 	rhl_for_each_entry_rcu(c, tmp, list, mnode) {
 		if (c->mfc_un.res.ttls[mifi] < 255)
 			return c;
@@ -1080,7 +1056,7 @@ skip:
 
 /* Look for a (S,G,iif) entry if parent != -1 */
 static struct mfc6_cache *
-ip6mr_cache_find_parent(struct mr6_table *mrt,
+ip6mr_cache_find_parent(struct mr_table *mrt,
 			const struct in6_addr *origin,
 			const struct in6_addr *mcastgrp,
 			int parent)
@@ -1092,7 +1068,7 @@ ip6mr_cache_find_parent(struct mr6_table *mrt,
 	struct rhlist_head *tmp, *list;
 	struct mfc6_cache *c;
 
-	list = rhltable_lookup(&mrt->mfc6_hash, &arg, ip6mr_rht_params);
+	list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params);
 	rhl_for_each_entry_rcu(c, tmp, list, mnode)
 		if (parent == -1 || parent == c->mf6c_parent)
 			return c;
@@ -1127,7 +1103,7 @@ static struct mfc6_cache *ip6mr_cache_alloc_unres(void)
  *	A cache entry has gone into a resolved state from queued
  */
 
-static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt,
+static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt,
 				struct mfc6_cache *uc, struct mfc6_cache *c)
 {
 	struct sk_buff *skb;
@@ -1161,7 +1137,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr6_table *mrt,
  *	Called under mrt_lock.
  */
 
-static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
+static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
 			      mifi_t mifi, int assert)
 {
 	struct sock *mroute6_sk;
@@ -1235,7 +1211,7 @@ static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
 	}
 
 	rcu_read_lock();
-	mroute6_sk = rcu_dereference(mrt->mroute6_sk);
+	mroute6_sk = rcu_dereference(mrt->mroute_sk);
 	if (!mroute6_sk) {
 		rcu_read_unlock();
 		kfree_skb(skb);
@@ -1260,14 +1236,14 @@ static int ip6mr_cache_report(struct mr6_table *mrt, struct sk_buff *pkt,
  */
 
 static int
-ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb)
+ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb)
 {
 	bool found = false;
 	int err;
 	struct mfc6_cache *c;
 
 	spin_lock_bh(&mfc_unres_lock);
-	list_for_each_entry(c, &mrt->mfc6_unres_queue, list) {
+	list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
 		if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) &&
 		    ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) {
 			found = true;
@@ -1311,7 +1287,7 @@ ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb)
 		}
 
 		atomic_inc(&mrt->cache_resolve_queue_len);
-		list_add(&c->list, &mrt->mfc6_unres_queue);
+		list_add(&c->list, &mrt->mfc_unres_queue);
 		mr6_netlink_event(mrt, c, RTM_NEWROUTE);
 
 		ipmr_do_expire_process(mrt);
@@ -1336,7 +1312,7 @@ ip6mr_cache_unresolved(struct mr6_table *mrt, mifi_t mifi, struct sk_buff *skb)
  *	MFC6 cache manipulation by user space
  */
 
-static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc,
+static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc,
 			    int parent)
 {
 	struct mfc6_cache *c;
@@ -1348,7 +1324,7 @@ static int ip6mr_mfc_delete(struct mr6_table *mrt, struct mf6cctl *mfc,
 	rcu_read_unlock();
 	if (!c)
 		return -ENOENT;
-	rhltable_remove(&mrt->mfc6_hash, &c->mnode, ip6mr_rht_params);
+	rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params);
 	list_del_rcu(&c->list);
 
 	mr6_netlink_event(mrt, c, RTM_DELROUTE);
@@ -1361,7 +1337,7 @@ static int ip6mr_device_event(struct notifier_block *this,
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct net *net = dev_net(dev);
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 	struct vif_device *v;
 	int ct;
 
@@ -1369,7 +1345,7 @@ static int ip6mr_device_event(struct notifier_block *this,
 		return NOTIFY_DONE;
 
 	ip6mr_for_each_table(mrt, net) {
-		v = &mrt->vif6_table[0];
+		v = &mrt->vif_table[0];
 		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
 			if (v->dev == dev)
 				mif6_delete(mrt, ct, 1, NULL);
@@ -1483,7 +1459,7 @@ void ip6_mr_cleanup(void)
 	kmem_cache_destroy(mrt_cachep);
 }
 
-static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt,
+static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
 			 struct mf6cctl *mfc, int mrtsock, int parent)
 {
 	unsigned char ttls[MAXMIFS];
@@ -1531,21 +1507,21 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt,
 	if (!mrtsock)
 		c->mfc_flags |= MFC_STATIC;
 
-	err = rhltable_insert_key(&mrt->mfc6_hash, &c->cmparg, &c->mnode,
+	err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode,
 				  ip6mr_rht_params);
 	if (err) {
 		pr_err("ip6mr: rhtable insert error %d\n", err);
 		ip6mr_cache_free(c);
 		return err;
 	}
-	list_add_tail_rcu(&c->list, &mrt->mfc6_cache_list);
+	list_add_tail_rcu(&c->list, &mrt->mfc_cache_list);
 
 	/* Check to see if we resolved a queued list. If so we
 	 * need to send on the frames and tidy up.
 	 */
 	found = false;
 	spin_lock_bh(&mfc_unres_lock);
-	list_for_each_entry(uc, &mrt->mfc6_unres_queue, list) {
+	list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
 		if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) &&
 		    ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) {
 			list_del(&uc->list);
@@ -1554,7 +1530,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt,
 			break;
 		}
 	}
-	if (list_empty(&mrt->mfc6_unres_queue))
+	if (list_empty(&mrt->mfc_unres_queue))
 		del_timer(&mrt->ipmr_expire_timer);
 	spin_unlock_bh(&mfc_unres_lock);
 
@@ -1570,7 +1546,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt,
  *	Close the multicast socket, and clear the vif tables etc
  */
 
-static void mroute_clean_tables(struct mr6_table *mrt, bool all)
+static void mroute_clean_tables(struct mr_table *mrt, bool all)
 {
 	struct mfc6_cache *c, *tmp;
 	LIST_HEAD(list);
@@ -1578,17 +1554,17 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all)
 
 	/* Shut down all active vif entries */
 	for (i = 0; i < mrt->maxvif; i++) {
-		if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC))
+		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
 			continue;
 		mif6_delete(mrt, i, 0, &list);
 	}
 	unregister_netdevice_many(&list);
 
 	/* Wipe the cache */
-	list_for_each_entry_safe(c, tmp, &mrt->mfc6_cache_list, list) {
+	list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
 		if (!all && (c->mfc_flags & MFC_STATIC))
 			continue;
-		rhltable_remove(&mrt->mfc6_hash, &c->mnode, ip6mr_rht_params);
+		rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params);
 		list_del_rcu(&c->list);
 		mr6_netlink_event(mrt, c, RTM_DELROUTE);
 		ip6mr_cache_free(c);
@@ -1596,7 +1572,7 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all)
 
 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
 		spin_lock_bh(&mfc_unres_lock);
-		list_for_each_entry_safe(c, tmp, &mrt->mfc6_unres_queue, list) {
+		list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
 			list_del(&c->list);
 			mr6_netlink_event(mrt, c, RTM_DELROUTE);
 			ip6mr_destroy_unres(mrt, c);
@@ -1605,17 +1581,17 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all)
 	}
 }
 
-static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk)
+static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
 {
 	int err = 0;
 	struct net *net = sock_net(sk);
 
 	rtnl_lock();
 	write_lock_bh(&mrt_lock);
-	if (rtnl_dereference(mrt->mroute6_sk)) {
+	if (rtnl_dereference(mrt->mroute_sk)) {
 		err = -EADDRINUSE;
 	} else {
-		rcu_assign_pointer(mrt->mroute6_sk, sk);
+		rcu_assign_pointer(mrt->mroute_sk, sk);
 		net->ipv6.devconf_all->mc_forwarding++;
 	}
 	write_unlock_bh(&mrt_lock);
@@ -1634,7 +1610,7 @@ int ip6mr_sk_done(struct sock *sk)
 {
 	int err = -EACCES;
 	struct net *net = sock_net(sk);
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 
 	if (sk->sk_type != SOCK_RAW ||
 	    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
@@ -1642,9 +1618,9 @@ int ip6mr_sk_done(struct sock *sk)
 
 	rtnl_lock();
 	ip6mr_for_each_table(mrt, net) {
-		if (sk == rtnl_dereference(mrt->mroute6_sk)) {
+		if (sk == rtnl_dereference(mrt->mroute_sk)) {
 			write_lock_bh(&mrt_lock);
-			RCU_INIT_POINTER(mrt->mroute6_sk, NULL);
+			RCU_INIT_POINTER(mrt->mroute_sk, NULL);
 			net->ipv6.devconf_all->mc_forwarding--;
 			write_unlock_bh(&mrt_lock);
 			inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
@@ -1665,7 +1641,7 @@ int ip6mr_sk_done(struct sock *sk)
 
 bool mroute6_is_socket(struct net *net, struct sk_buff *skb)
 {
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 	struct flowi6 fl6 = {
 		.flowi6_iif	= skb->skb_iif ? : LOOPBACK_IFINDEX,
 		.flowi6_oif	= skb->dev->ifindex,
@@ -1675,7 +1651,7 @@ bool mroute6_is_socket(struct net *net, struct sk_buff *skb)
 	if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
 		return NULL;
 
-	return rcu_access_pointer(mrt->mroute6_sk);
+	return rcu_access_pointer(mrt->mroute_sk);
 }
 EXPORT_SYMBOL(mroute6_is_socket);
 
@@ -1693,7 +1669,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 	struct mf6cctl mfc;
 	mifi_t mifi;
 	struct net *net = sock_net(sk);
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 
 	if (sk->sk_type != SOCK_RAW ||
 	    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
@@ -1704,7 +1680,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 		return -ENOENT;
 
 	if (optname != MRT6_INIT) {
-		if (sk != rcu_access_pointer(mrt->mroute6_sk) &&
+		if (sk != rcu_access_pointer(mrt->mroute_sk) &&
 		    !ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EACCES;
 	}
@@ -1728,7 +1704,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 			return -ENFILE;
 		rtnl_lock();
 		ret = mif6_add(net, mrt, &vif,
-			       sk == rtnl_dereference(mrt->mroute6_sk));
+			       sk == rtnl_dereference(mrt->mroute_sk));
 		rtnl_unlock();
 		return ret;
 
@@ -1764,7 +1740,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 		else
 			ret = ip6mr_mfc_add(net, mrt, &mfc,
 					    sk ==
-					    rtnl_dereference(mrt->mroute6_sk),
+					    rtnl_dereference(mrt->mroute_sk),
 					    parent);
 		rtnl_unlock();
 		return ret;
@@ -1817,7 +1793,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 		/* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */
 		if (v != RT_TABLE_DEFAULT && v >= 100000000)
 			return -EINVAL;
-		if (sk == rcu_access_pointer(mrt->mroute6_sk))
+		if (sk == rcu_access_pointer(mrt->mroute_sk))
 			return -EBUSY;
 
 		rtnl_lock();
@@ -1848,7 +1824,7 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
 	int olr;
 	int val;
 	struct net *net = sock_net(sk);
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 
 	if (sk->sk_type != SOCK_RAW ||
 	    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
@@ -1899,7 +1875,7 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
 	struct vif_device *vif;
 	struct mfc6_cache *c;
 	struct net *net = sock_net(sk);
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 
 	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
 	if (!mrt)
@@ -1912,8 +1888,8 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
 		if (vr.mifi >= mrt->maxvif)
 			return -EINVAL;
 		read_lock(&mrt_lock);
-		vif = &mrt->vif6_table[vr.mifi];
-		if (MIF_EXISTS(mrt, vr.mifi)) {
+		vif = &mrt->vif_table[vr.mifi];
+		if (VIF_EXISTS(mrt, vr.mifi)) {
 			vr.icount = vif->pkt_in;
 			vr.ocount = vif->pkt_out;
 			vr.ibytes = vif->bytes_in;
@@ -1973,7 +1949,7 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
 	struct vif_device *vif;
 	struct mfc6_cache *c;
 	struct net *net = sock_net(sk);
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 
 	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
 	if (!mrt)
@@ -1986,8 +1962,8 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
 		if (vr.mifi >= mrt->maxvif)
 			return -EINVAL;
 		read_lock(&mrt_lock);
-		vif = &mrt->vif6_table[vr.mifi];
-		if (MIF_EXISTS(mrt, vr.mifi)) {
+		vif = &mrt->vif_table[vr.mifi];
+		if (VIF_EXISTS(mrt, vr.mifi)) {
 			vr.icount = vif->pkt_in;
 			vr.ocount = vif->pkt_out;
 			vr.ibytes = vif->bytes_in;
@@ -2037,11 +2013,11 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct
  *	Processing handlers for ip6mr_forward
  */
 
-static int ip6mr_forward2(struct net *net, struct mr6_table *mrt,
+static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
 			  struct sk_buff *skb, struct mfc6_cache *c, int vifi)
 {
 	struct ipv6hdr *ipv6h;
-	struct vif_device *vif = &mrt->vif6_table[vifi];
+	struct vif_device *vif = &mrt->vif_table[vifi];
 	struct net_device *dev;
 	struct dst_entry *dst;
 	struct flowi6 fl6;
@@ -2111,18 +2087,18 @@ out_free:
 	return 0;
 }
 
-static int ip6mr_find_vif(struct mr6_table *mrt, struct net_device *dev)
+static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev)
 {
 	int ct;
 
 	for (ct = mrt->maxvif - 1; ct >= 0; ct--) {
-		if (mrt->vif6_table[ct].dev == dev)
+		if (mrt->vif_table[ct].dev == dev)
 			break;
 	}
 	return ct;
 }
 
-static void ip6_mr_forward(struct net *net, struct mr6_table *mrt,
+static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
 			   struct sk_buff *skb, struct mfc6_cache *cache)
 {
 	int psend = -1;
@@ -2153,7 +2129,7 @@ static void ip6_mr_forward(struct net *net, struct mr6_table *mrt,
 	/*
 	 * Wrong interface: drop packet and (maybe) send PIM assert.
 	 */
-	if (mrt->vif6_table[vif].dev != skb->dev) {
+	if (mrt->vif_table[vif].dev != skb->dev) {
 		cache->mfc_un.res.wrong_if++;
 
 		if (true_vifi >= 0 && mrt->mroute_do_assert &&
@@ -2173,8 +2149,8 @@ static void ip6_mr_forward(struct net *net, struct mr6_table *mrt,
 	}
 
 forward:
-	mrt->vif6_table[vif].pkt_in++;
-	mrt->vif6_table[vif].bytes_in += skb->len;
+	mrt->vif_table[vif].pkt_in++;
+	mrt->vif_table[vif].bytes_in += skb->len;
 
 	/*
 	 *	Forward the frame
@@ -2225,7 +2201,7 @@ int ip6_mr_input(struct sk_buff *skb)
 {
 	struct mfc6_cache *cache;
 	struct net *net = dev_net(skb->dev);
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 	struct flowi6 fl6 = {
 		.flowi6_iif	= skb->dev->ifindex,
 		.flowi6_mark	= skb->mark,
@@ -2276,7 +2252,7 @@ int ip6_mr_input(struct sk_buff *skb)
 }
 
 
-static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
+static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 			       struct mfc6_cache *c, struct rtmsg *rtm)
 {
 	struct rta_mfc_stats mfcs;
@@ -2291,15 +2267,16 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
 		return -ENOENT;
 	}
 
-	if (MIF_EXISTS(mrt, c->mf6c_parent) &&
-	    nla_put_u32(skb, RTA_IIF, mrt->vif6_table[c->mf6c_parent].dev->ifindex) < 0)
+	if (VIF_EXISTS(mrt, c->mf6c_parent) &&
+	    nla_put_u32(skb, RTA_IIF,
+			mrt->vif_table[c->mf6c_parent].dev->ifindex) < 0)
 		return -EMSGSIZE;
 	mp_attr = nla_nest_start(skb, RTA_MULTIPATH);
 	if (!mp_attr)
 		return -EMSGSIZE;
 
 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
-		if (MIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
 			nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
 			if (!nhp) {
 				nla_nest_cancel(skb, mp_attr);
@@ -2308,7 +2285,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
 
 			nhp->rtnh_flags = 0;
 			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
-			nhp->rtnh_ifindex = mrt->vif6_table[ct].dev->ifindex;
+			nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
 			nhp->rtnh_len = sizeof(*nhp);
 		}
 	}
@@ -2334,7 +2311,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 		    u32 portid)
 {
 	int err;
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 	struct mfc6_cache *cache;
 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 
@@ -2403,7 +2380,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 	return err;
 }
 
-static int ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb,
+static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 			     u32 portid, u32 seq, struct mfc6_cache *c, int cmd,
 			     int flags)
 {
@@ -2468,7 +2445,7 @@ static int mr6_msgsize(bool unresolved, int maxvif)
 	return len;
 }
 
-static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc,
+static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
 			      int cmd)
 {
 	struct net *net = read_pnet(&mrt->net);
@@ -2510,7 +2487,7 @@ static size_t mrt6msg_netlink_msgsize(size_t payloadlen)
 	return len;
 }
 
-static void mrt6msg_netlink_event(struct mr6_table *mrt, struct sk_buff *pkt)
+static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
 {
 	struct net *net = read_pnet(&mrt->net);
 	struct nlmsghdr *nlh;
@@ -2561,7 +2538,7 @@ errout:
 static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
-	struct mr6_table *mrt;
+	struct mr_table *mrt;
 	struct mfc6_cache *mfc;
 	unsigned int t = 0, s_t;
 	unsigned int e = 0, s_e;
@@ -2573,7 +2550,7 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 	ip6mr_for_each_table(mrt, net) {
 		if (t < s_t)
 			goto next_table;
-		list_for_each_entry_rcu(mfc, &mrt->mfc6_cache_list, list) {
+		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
 			if (e < s_e)
 				goto next_entry;
 			if (ip6mr_fill_mroute(mrt, skb,
@@ -2589,7 +2566,7 @@ next_entry:
 		s_e = 0;
 
 		spin_lock_bh(&mfc_unres_lock);
-		list_for_each_entry(mfc, &mrt->mfc6_unres_queue, list) {
+		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
 			if (e < s_e)
 				goto next_entry2;
 			if (ip6mr_fill_mroute(mrt, skb,
-- 
cgit v1.2.3


From 0bbbf0e7d0e7ea8267836986346a9b3a35b74e4e Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Wed, 28 Feb 2018 23:29:33 +0200
Subject: ipmr, ip6mr: Unite creation of new mr_table

Now that both ipmr and ip6mr are using the same mr_table structure,
we can have a common function to allocate & initialize a new instance.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c      | 27 ++++++++++-----------------
 net/ipv4/ipmr_base.c | 27 +++++++++++++++++++++++++++
 net/ipv6/ip6mr.c     | 30 ++++++++++--------------------
 3 files changed, 47 insertions(+), 37 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a1bf0020cad1..f213933db177 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -352,6 +352,14 @@ static const struct rhashtable_params ipmr_rht_params = {
 	.automatic_shrinking = true,
 };
 
+static void ipmr_new_table_set(struct mr_table *mrt,
+			       struct net *net)
+{
+#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+	list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
+#endif
+}
+
 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 {
 	struct mr_table *mrt;
@@ -364,23 +372,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 	if (mrt)
 		return mrt;
 
-	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
-	if (!mrt)
-		return ERR_PTR(-ENOMEM);
-	write_pnet(&mrt->net, net);
-	mrt->id = id;
-
-	rhltable_init(&mrt->mfc_hash, &ipmr_rht_params);
-	INIT_LIST_HEAD(&mrt->mfc_cache_list);
-	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
-
-	timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0);
-
-	mrt->mroute_reg_vif_num = -1;
-#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
-	list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
-#endif
-	return mrt;
+	return mr_table_alloc(net, id, &ipmr_rht_params,
+			      ipmr_expire_process, ipmr_new_table_set);
 }
 
 static void ipmr_free_table(struct mr_table *mrt)
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 22758f848344..3e21a5806d0e 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -26,3 +26,30 @@ void vif_device_init(struct vif_device *v,
 		v->link = dev->ifindex;
 }
 EXPORT_SYMBOL(vif_device_init);
+
+struct mr_table *
+mr_table_alloc(struct net *net, u32 id,
+	       const struct rhashtable_params *rht_params,
+	       void (*expire_func)(struct timer_list *t),
+	       void (*table_set)(struct mr_table *mrt,
+				 struct net *net))
+{
+	struct mr_table *mrt;
+
+	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
+	if (!mrt)
+		return NULL;
+	mrt->id = id;
+	write_pnet(&mrt->net, net);
+
+	rhltable_init(&mrt->mfc_hash, rht_params);
+	INIT_LIST_HEAD(&mrt->mfc_cache_list);
+	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
+
+	timer_setup(&mrt->ipmr_expire_timer, expire_func, 0);
+
+	mrt->mroute_reg_vif_num = -1;
+	table_set(mrt, net);
+	return mrt;
+}
+EXPORT_SYMBOL(mr_table_alloc);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index adbb826ca8fd..d50852882966 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -31,7 +31,6 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/compat.h>
 #include <net/protocol.h>
 #include <linux/skbuff.h>
@@ -295,6 +294,14 @@ static const struct rhashtable_params ip6mr_rht_params = {
 	.automatic_shrinking = true,
 };
 
+static void ip6mr_new_table_set(struct mr_table *mrt,
+				struct net *net)
+{
+#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
+	list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables);
+#endif
+}
+
 static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
 {
 	struct mr_table *mrt;
@@ -303,25 +310,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
 	if (mrt)
 		return mrt;
 
-	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
-	if (!mrt)
-		return NULL;
-	mrt->id = id;
-	write_pnet(&mrt->net, net);
-
-	rhltable_init(&mrt->mfc_hash, &ip6mr_rht_params);
-	INIT_LIST_HEAD(&mrt->mfc_cache_list);
-	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
-
-	timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0);
-
-#ifdef CONFIG_IPV6_PIMSM_V2
-	mrt->mroute_reg_vif_num = -1;
-#endif
-#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
-	list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables);
-#endif
-	return mrt;
+	return mr_table_alloc(net, id, &ip6mr_rht_params,
+			      ipmr_expire_process, ip6mr_new_table_set);
 }
 
 static void ip6mr_free_table(struct mr_table *mrt)
-- 
cgit v1.2.3


From 494fff56379c4ad5b8fe36a5b7ffede4044ca7bb Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Wed, 28 Feb 2018 23:29:34 +0200
Subject: ipmr, ip6mr: Make mfc_cache a common structure

mfc_cache and mfc6_cache are almost identical - the main difference is
in the origin/group addresses and comparison-key. Make a common
structure encapsulating most of the multicast routing logic  - mr_mfc
and convert both ipmr and ip6mr into using it.

For easy conversion [casting, in this case] mr_mfc has to be the first
field inside every multicast routing abstraction utilizing it.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c  | 233 +++++++++++++++++++++++++++------------------------
 net/ipv6/ip6mr.c | 248 ++++++++++++++++++++++++++++---------------------------
 2 files changed, 251 insertions(+), 230 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index f213933db177..3f7515086e85 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -106,7 +106,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 static int ipmr_cache_report(struct mr_table *mrt,
 			     struct sk_buff *pkt, vifi_t vifi, int assert);
 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
-			      struct mfc_cache *c, struct rtmsg *rtm);
+			      struct mr_mfc *c, struct rtmsg *rtm);
 static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
 				 int cmd);
 static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
@@ -343,7 +343,7 @@ static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
 }
 
 static const struct rhashtable_params ipmr_rht_params = {
-	.head_offset = offsetof(struct mfc_cache, mnode),
+	.head_offset = offsetof(struct mr_mfc, mnode),
 	.key_offset = offsetof(struct mfc_cache, cmparg),
 	.key_len = sizeof(struct mfc_cache_cmp_arg),
 	.nelem_hint = 3,
@@ -752,14 +752,14 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 
 static void ipmr_cache_free_rcu(struct rcu_head *head)
 {
-	struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
+	struct mr_mfc *c = container_of(head, struct mr_mfc, rcu);
 
-	kmem_cache_free(mrt_cachep, c);
+	kmem_cache_free(mrt_cachep, (struct mfc_cache *)c);
 }
 
 void ipmr_cache_free(struct mfc_cache *c)
 {
-	call_rcu(&c->rcu, ipmr_cache_free_rcu);
+	call_rcu(&c->_c.rcu, ipmr_cache_free_rcu);
 }
 EXPORT_SYMBOL(ipmr_cache_free);
 
@@ -774,7 +774,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
 
 	atomic_dec(&mrt->cache_resolve_queue_len);
 
-	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
+	while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) {
 		if (ip_hdr(skb)->version == 0) {
 			struct nlmsghdr *nlh = skb_pull(skb,
 							sizeof(struct iphdr));
@@ -798,9 +798,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
 static void ipmr_expire_process(struct timer_list *t)
 {
 	struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
-	unsigned long now;
+	struct mr_mfc *c, *next;
 	unsigned long expires;
-	struct mfc_cache *c, *next;
+	unsigned long now;
 
 	if (!spin_trylock(&mfc_unres_lock)) {
 		mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
@@ -822,8 +822,8 @@ static void ipmr_expire_process(struct timer_list *t)
 		}
 
 		list_del(&c->list);
-		mroute_netlink_event(mrt, c, RTM_DELROUTE);
-		ipmr_destroy_unres(mrt, c);
+		mroute_netlink_event(mrt, (struct mfc_cache *)c, RTM_DELROUTE);
+		ipmr_destroy_unres(mrt, (struct mfc_cache *)c);
 	}
 
 	if (!list_empty(&mrt->mfc_unres_queue))
@@ -834,7 +834,7 @@ out:
 }
 
 /* Fill oifs list. It is called under write locked mrt_lock. */
-static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
+static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache,
 				   unsigned char *ttls)
 {
 	int vifi;
@@ -974,11 +974,11 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 			.mfc_origin = origin
 	};
 	struct rhlist_head *tmp, *list;
-	struct mfc_cache *c;
+	struct mr_mfc *c;
 
 	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
 	rhl_for_each_entry_rcu(c, tmp, list, mnode)
-		return c;
+		return (struct mfc_cache *)c;
 
 	return NULL;
 }
@@ -992,12 +992,12 @@ static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
 			.mfc_origin = htonl(INADDR_ANY)
 	};
 	struct rhlist_head *tmp, *list;
-	struct mfc_cache *c;
+	struct mr_mfc *c;
 
 	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
 	rhl_for_each_entry_rcu(c, tmp, list, mnode)
 		if (c->mfc_un.res.ttls[vifi] < 255)
-			return c;
+			return (struct mfc_cache *)c;
 
 	return NULL;
 }
@@ -1011,20 +1011,22 @@ static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
 			.mfc_origin = htonl(INADDR_ANY)
 	};
 	struct rhlist_head *tmp, *list;
-	struct mfc_cache *c, *proxy;
+	struct mr_mfc *c;
 
 	if (mcastgrp == htonl(INADDR_ANY))
 		goto skip;
 
 	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
 	rhl_for_each_entry_rcu(c, tmp, list, mnode) {
+		struct mfc_cache *proxy;
+
 		if (c->mfc_un.res.ttls[vifi] < 255)
-			return c;
+			return (struct mfc_cache *)c;
 
 		/* It's ok if the vifi is part of the static tree */
 		proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent);
-		if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
-			return c;
+		if (proxy && proxy->_c.mfc_un.res.ttls[vifi] < 255)
+			return (struct mfc_cache *)c;
 	}
 
 skip:
@@ -1041,12 +1043,12 @@ static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
 			.mfc_origin = origin,
 	};
 	struct rhlist_head *tmp, *list;
-	struct mfc_cache *c;
+	struct mr_mfc *c;
 
 	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
 	rhl_for_each_entry_rcu(c, tmp, list, mnode)
 		if (parent == -1 || parent == c->mfc_parent)
-			return c;
+			return (struct mfc_cache *)c;
 
 	return NULL;
 }
@@ -1057,9 +1059,9 @@ static struct mfc_cache *ipmr_cache_alloc(void)
 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
 
 	if (c) {
-		c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
-		c->mfc_un.res.minvif = MAXVIFS;
-		refcount_set(&c->mfc_un.res.refcount, 1);
+		c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
+		c->_c.mfc_un.res.minvif = MAXVIFS;
+		refcount_set(&c->_c.mfc_un.res.refcount, 1);
 	}
 	return c;
 }
@@ -1069,8 +1071,8 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void)
 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
 
 	if (c) {
-		skb_queue_head_init(&c->mfc_un.unres.unresolved);
-		c->mfc_un.unres.expires = jiffies + 10*HZ;
+		skb_queue_head_init(&c->_c.mfc_un.unres.unresolved);
+		c->_c.mfc_un.unres.expires = jiffies + 10 * HZ;
 	}
 	return c;
 }
@@ -1083,12 +1085,13 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
 	struct nlmsgerr *e;
 
 	/* Play the pending entries through our router */
-	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
+	while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) {
 		if (ip_hdr(skb)->version == 0) {
 			struct nlmsghdr *nlh = skb_pull(skb,
 							sizeof(struct iphdr));
 
-			if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) {
+			if (__ipmr_fill_mroute(mrt, skb, &c->_c,
+					       nlmsg_data(nlh)) > 0) {
 				nlh->nlmsg_len = skb_tail_pointer(skb) -
 						 (u8 *)nlh;
 			} else {
@@ -1196,7 +1199,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 	int err;
 
 	spin_lock_bh(&mfc_unres_lock);
-	list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
+	list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) {
 		if (c->mfc_mcastgrp == iph->daddr &&
 		    c->mfc_origin == iph->saddr) {
 			found = true;
@@ -1215,12 +1218,13 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 		}
 
 		/* Fill in the new cache entry */
-		c->mfc_parent	= -1;
+		c->_c.mfc_parent = -1;
 		c->mfc_origin	= iph->saddr;
 		c->mfc_mcastgrp	= iph->daddr;
 
 		/* Reflect first query at mrouted. */
 		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
+
 		if (err < 0) {
 			/* If the report failed throw the cache entry
 			   out - Brad Parker
@@ -1233,15 +1237,16 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 		}
 
 		atomic_inc(&mrt->cache_resolve_queue_len);
-		list_add(&c->list, &mrt->mfc_unres_queue);
+		list_add(&c->_c.list, &mrt->mfc_unres_queue);
 		mroute_netlink_event(mrt, c, RTM_NEWROUTE);
 
 		if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
-			mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
+			mod_timer(&mrt->ipmr_expire_timer,
+				  c->_c.mfc_un.unres.expires);
 	}
 
 	/* See if we can append the packet */
-	if (c->mfc_un.unres.unresolved.qlen > 3) {
+	if (c->_c.mfc_un.unres.unresolved.qlen > 3) {
 		kfree_skb(skb);
 		err = -ENOBUFS;
 	} else {
@@ -1249,7 +1254,7 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
 			skb->dev = dev;
 			skb->skb_iif = dev->ifindex;
 		}
-		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
+		skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
 		err = 0;
 	}
 
@@ -1271,8 +1276,8 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
 	rcu_read_unlock();
 	if (!c)
 		return -ENOENT;
-	rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
-	list_del_rcu(&c->list);
+	rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ipmr_rht_params);
+	list_del_rcu(&c->_c.list);
 	call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id);
 	mroute_netlink_event(mrt, c, RTM_DELROUTE);
 	ipmr_cache_put(c);
@@ -1284,6 +1289,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 			struct mfcctl *mfc, int mrtsock, int parent)
 {
 	struct mfc_cache *uc, *c;
+	struct mr_mfc *_uc;
 	bool found;
 	int ret;
 
@@ -1297,10 +1303,10 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 	rcu_read_unlock();
 	if (c) {
 		write_lock_bh(&mrt_lock);
-		c->mfc_parent = mfc->mfcc_parent;
-		ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
+		c->_c.mfc_parent = mfc->mfcc_parent;
+		ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
 		if (!mrtsock)
-			c->mfc_flags |= MFC_STATIC;
+			c->_c.mfc_flags |= MFC_STATIC;
 		write_unlock_bh(&mrt_lock);
 		call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c,
 					      mrt->id);
@@ -1318,28 +1324,29 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 
 	c->mfc_origin = mfc->mfcc_origin.s_addr;
 	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
-	c->mfc_parent = mfc->mfcc_parent;
-	ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
+	c->_c.mfc_parent = mfc->mfcc_parent;
+	ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
 	if (!mrtsock)
-		c->mfc_flags |= MFC_STATIC;
+		c->_c.mfc_flags |= MFC_STATIC;
 
-	ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode,
+	ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode,
 				  ipmr_rht_params);
 	if (ret) {
 		pr_err("ipmr: rhtable insert error %d\n", ret);
 		ipmr_cache_free(c);
 		return ret;
 	}
-	list_add_tail_rcu(&c->list, &mrt->mfc_cache_list);
+	list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list);
 	/* Check to see if we resolved a queued list. If so we
 	 * need to send on the frames and tidy up.
 	 */
 	found = false;
 	spin_lock_bh(&mfc_unres_lock);
-	list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
+	list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) {
+		uc = (struct mfc_cache *)_uc;
 		if (uc->mfc_origin == c->mfc_origin &&
 		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
-			list_del(&uc->list);
+			list_del(&_uc->list);
 			atomic_dec(&mrt->cache_resolve_queue_len);
 			found = true;
 			break;
@@ -1362,7 +1369,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 static void mroute_clean_tables(struct mr_table *mrt, bool all)
 {
 	struct net *net = read_pnet(&mrt->net);
-	struct mfc_cache *c, *tmp;
+	struct mr_mfc *c, *tmp;
+	struct mfc_cache *cache;
 	LIST_HEAD(list);
 	int i;
 
@@ -1380,18 +1388,20 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 			continue;
 		rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
 		list_del_rcu(&c->list);
-		call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c,
+		cache = (struct mfc_cache *)c;
+		call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache,
 					      mrt->id);
-		mroute_netlink_event(mrt, c, RTM_DELROUTE);
-		ipmr_cache_put(c);
+		mroute_netlink_event(mrt, cache, RTM_DELROUTE);
+		ipmr_cache_put(cache);
 	}
 
 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
 		spin_lock_bh(&mfc_unres_lock);
 		list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
 			list_del(&c->list);
-			mroute_netlink_event(mrt, c, RTM_DELROUTE);
-			ipmr_destroy_unres(mrt, c);
+			cache = (struct mfc_cache *)c;
+			mroute_netlink_event(mrt, cache, RTM_DELROUTE);
+			ipmr_destroy_unres(mrt, cache);
 		}
 		spin_unlock_bh(&mfc_unres_lock);
 	}
@@ -1683,9 +1693,9 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
 		rcu_read_lock();
 		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
 		if (c) {
-			sr.pktcnt = c->mfc_un.res.pkt;
-			sr.bytecnt = c->mfc_un.res.bytes;
-			sr.wrong_if = c->mfc_un.res.wrong_if;
+			sr.pktcnt = c->_c.mfc_un.res.pkt;
+			sr.bytecnt = c->_c.mfc_un.res.bytes;
+			sr.wrong_if = c->_c.mfc_un.res.wrong_if;
 			rcu_read_unlock();
 
 			if (copy_to_user(arg, &sr, sizeof(sr)))
@@ -1757,9 +1767,9 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
 		rcu_read_lock();
 		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
 		if (c) {
-			sr.pktcnt = c->mfc_un.res.pkt;
-			sr.bytecnt = c->mfc_un.res.bytes;
-			sr.wrong_if = c->mfc_un.res.wrong_if;
+			sr.pktcnt = c->_c.mfc_un.res.pkt;
+			sr.bytecnt = c->_c.mfc_un.res.bytes;
+			sr.wrong_if = c->_c.mfc_un.res.wrong_if;
 			rcu_read_unlock();
 
 			if (copy_to_user(arg, &sr, sizeof(sr)))
@@ -1983,18 +1993,18 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
 /* "local" means that we should preserve one skb (for local delivery) */
 static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 			  struct net_device *dev, struct sk_buff *skb,
-			  struct mfc_cache *cache, int local)
+			  struct mfc_cache *c, int local)
 {
 	int true_vifi = ipmr_find_vif(mrt, dev);
 	int psend = -1;
 	int vif, ct;
 
-	vif = cache->mfc_parent;
-	cache->mfc_un.res.pkt++;
-	cache->mfc_un.res.bytes += skb->len;
-	cache->mfc_un.res.lastuse = jiffies;
+	vif = c->_c.mfc_parent;
+	c->_c.mfc_un.res.pkt++;
+	c->_c.mfc_un.res.bytes += skb->len;
+	c->_c.mfc_un.res.lastuse = jiffies;
 
-	if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
+	if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
 		struct mfc_cache *cache_proxy;
 
 		/* For an (*,G) entry, we only check that the incomming
@@ -2002,7 +2012,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 		 */
 		cache_proxy = ipmr_cache_find_any_parent(mrt, vif);
 		if (cache_proxy &&
-		    cache_proxy->mfc_un.res.ttls[true_vifi] < 255)
+		    cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255)
 			goto forward;
 	}
 
@@ -2023,7 +2033,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 			goto dont_forward;
 		}
 
-		cache->mfc_un.res.wrong_if++;
+		c->_c.mfc_un.res.wrong_if++;
 
 		if (true_vifi >= 0 && mrt->mroute_do_assert &&
 		    /* pimsm uses asserts, when switching from RPT to SPT,
@@ -2032,10 +2042,11 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 		     * large chunk of pimd to kernel. Ough... --ANK
 		     */
 		    (mrt->mroute_do_pim ||
-		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
+		     c->_c.mfc_un.res.ttls[true_vifi] < 255) &&
 		    time_after(jiffies,
-			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
-			cache->mfc_un.res.last_assert = jiffies;
+			       c->_c.mfc_un.res.last_assert +
+			       MFC_ASSERT_THRESH)) {
+			c->_c.mfc_un.res.last_assert = jiffies;
 			ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
 		}
 		goto dont_forward;
@@ -2046,33 +2057,33 @@ forward:
 	mrt->vif_table[vif].bytes_in += skb->len;
 
 	/* Forward the frame */
-	if (cache->mfc_origin == htonl(INADDR_ANY) &&
-	    cache->mfc_mcastgrp == htonl(INADDR_ANY)) {
+	if (c->mfc_origin == htonl(INADDR_ANY) &&
+	    c->mfc_mcastgrp == htonl(INADDR_ANY)) {
 		if (true_vifi >= 0 &&
-		    true_vifi != cache->mfc_parent &&
+		    true_vifi != c->_c.mfc_parent &&
 		    ip_hdr(skb)->ttl >
-				cache->mfc_un.res.ttls[cache->mfc_parent]) {
+				c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
 			/* It's an (*,*) entry and the packet is not coming from
 			 * the upstream: forward the packet to the upstream
 			 * only.
 			 */
-			psend = cache->mfc_parent;
+			psend = c->_c.mfc_parent;
 			goto last_forward;
 		}
 		goto dont_forward;
 	}
-	for (ct = cache->mfc_un.res.maxvif - 1;
-	     ct >= cache->mfc_un.res.minvif; ct--) {
+	for (ct = c->_c.mfc_un.res.maxvif - 1;
+	     ct >= c->_c.mfc_un.res.minvif; ct--) {
 		/* For (*,G) entry, don't forward to the incoming interface */
-		if ((cache->mfc_origin != htonl(INADDR_ANY) ||
+		if ((c->mfc_origin != htonl(INADDR_ANY) ||
 		     ct != true_vifi) &&
-		    ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
+		    ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) {
 			if (psend != -1) {
 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 
 				if (skb2)
 					ipmr_queue_xmit(net, mrt, true_vifi,
-							skb2, cache, psend);
+							skb2, c, psend);
 			}
 			psend = ct;
 		}
@@ -2084,9 +2095,9 @@ last_forward:
 
 			if (skb2)
 				ipmr_queue_xmit(net, mrt, true_vifi, skb2,
-						cache, psend);
+						c, psend);
 		} else {
-			ipmr_queue_xmit(net, mrt, true_vifi, skb, cache, psend);
+			ipmr_queue_xmit(net, mrt, true_vifi, skb, c, psend);
 			return;
 		}
 	}
@@ -2285,7 +2296,7 @@ drop:
 #endif
 
 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
-			      struct mfc_cache *c, struct rtmsg *rtm)
+			      struct mr_mfc *c, struct rtmsg *rtm)
 {
 	struct rta_mfc_stats mfcs;
 	struct nlattr *mp_attr;
@@ -2401,7 +2412,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
 	}
 
 	read_lock(&mrt_lock);
-	err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
+	err = __ipmr_fill_mroute(mrt, skb, &cache->_c, rtm);
 	read_unlock(&mrt_lock);
 	rcu_read_unlock();
 	return err;
@@ -2429,7 +2440,7 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 		goto nla_put_failure;
 	rtm->rtm_type     = RTN_MULTICAST;
 	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
-	if (c->mfc_flags & MFC_STATIC)
+	if (c->_c.mfc_flags & MFC_STATIC)
 		rtm->rtm_protocol = RTPROT_STATIC;
 	else
 		rtm->rtm_protocol = RTPROT_MROUTED;
@@ -2438,7 +2449,7 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 	if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
 	    nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
 		goto nla_put_failure;
-	err = __ipmr_fill_mroute(mrt, skb, c, rtm);
+	err = __ipmr_fill_mroute(mrt, skb, &c->_c, rtm);
 	/* do not break the dump if cache is unresolved */
 	if (err < 0 && err != -ENOENT)
 		goto nla_put_failure;
@@ -2479,7 +2490,8 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
 	struct sk_buff *skb;
 	int err = -ENOBUFS;
 
-	skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif),
+	skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS,
+				       mrt->maxvif),
 			GFP_ATOMIC);
 	if (!skb)
 		goto errout;
@@ -2624,10 +2636,10 @@ errout_free:
 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
-	struct mr_table *mrt;
-	struct mfc_cache *mfc;
 	unsigned int t = 0, s_t;
 	unsigned int e = 0, s_e;
+	struct mr_table *mrt;
+	struct mr_mfc *mfc;
 
 	s_t = cb->args[0];
 	s_e = cb->args[1];
@@ -2642,8 +2654,8 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 			if (ipmr_fill_mroute(mrt, skb,
 					     NETLINK_CB(cb->skb).portid,
 					     cb->nlh->nlmsg_seq,
-					     mfc, RTM_NEWROUTE,
-					     NLM_F_MULTI) < 0)
+					     (struct mfc_cache *)mfc,
+					     RTM_NEWROUTE, NLM_F_MULTI) < 0)
 				goto done;
 next_entry:
 			e++;
@@ -2658,8 +2670,8 @@ next_entry:
 			if (ipmr_fill_mroute(mrt, skb,
 					     NETLINK_CB(cb->skb).portid,
 					     cb->nlh->nlmsg_seq,
-					     mfc, RTM_NEWROUTE,
-					     NLM_F_MULTI) < 0) {
+					     (struct mfc_cache *)mfc,
+					     RTM_NEWROUTE, NLM_F_MULTI) < 0) {
 				spin_unlock_bh(&mfc_unres_lock);
 				goto done;
 			}
@@ -3051,20 +3063,21 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 					  struct ipmr_mfc_iter *it, loff_t pos)
 {
 	struct mr_table *mrt = it->mrt;
-	struct mfc_cache *mfc;
+	struct mr_mfc *mfc;
 
 	rcu_read_lock();
 	it->cache = &mrt->mfc_cache_list;
 	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
 		if (pos-- == 0)
-			return mfc;
+			return (struct mfc_cache *)mfc;
 	rcu_read_unlock();
 
 	spin_lock_bh(&mfc_unres_lock);
 	it->cache = &mrt->mfc_unres_queue;
 	list_for_each_entry(mfc, it->cache, list)
 		if (pos-- == 0)
-			return mfc;
+			return (struct mfc_cache *)mfc;
+
 	spin_unlock_bh(&mfc_unres_lock);
 
 	it->cache = NULL;
@@ -3100,8 +3113,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (v == SEQ_START_TOKEN)
 		return ipmr_mfc_seq_idx(net, seq->private, 0);
 
-	if (mfc->list.next != it->cache)
-		return list_entry(mfc->list.next, struct mfc_cache, list);
+	if (mfc->_c.list.next != it->cache)
+		return (struct mfc_cache *)(list_entry(mfc->_c.list.next,
+						       struct mr_mfc, list));
 
 	if (it->cache == &mrt->mfc_unres_queue)
 		goto end_of_list;
@@ -3112,7 +3126,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 	spin_lock_bh(&mfc_unres_lock);
 	if (!list_empty(it->cache))
-		return list_first_entry(it->cache, struct mfc_cache, list);
+		return (struct mfc_cache *)(list_first_entry(it->cache,
+							     struct mr_mfc,
+							     list));
 
 end_of_list:
 	spin_unlock_bh(&mfc_unres_lock);
@@ -3147,20 +3163,20 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 		seq_printf(seq, "%08X %08X %-3hd",
 			   (__force u32) mfc->mfc_mcastgrp,
 			   (__force u32) mfc->mfc_origin,
-			   mfc->mfc_parent);
+			   mfc->_c.mfc_parent);
 
 		if (it->cache != &mrt->mfc_unres_queue) {
 			seq_printf(seq, " %8lu %8lu %8lu",
-				   mfc->mfc_un.res.pkt,
-				   mfc->mfc_un.res.bytes,
-				   mfc->mfc_un.res.wrong_if);
-			for (n = mfc->mfc_un.res.minvif;
-			     n < mfc->mfc_un.res.maxvif; n++) {
+				   mfc->_c.mfc_un.res.pkt,
+				   mfc->_c.mfc_un.res.bytes,
+				   mfc->_c.mfc_un.res.wrong_if);
+			for (n = mfc->_c.mfc_un.res.minvif;
+			     n < mfc->_c.mfc_un.res.maxvif; n++) {
 				if (VIF_EXISTS(mrt, n) &&
-				    mfc->mfc_un.res.ttls[n] < 255)
+				    mfc->_c.mfc_un.res.ttls[n] < 255)
 					seq_printf(seq,
 					   " %2d:%-3d",
-					   n, mfc->mfc_un.res.ttls[n]);
+					   n, mfc->_c.mfc_un.res.ttls[n]);
 			}
 		} else {
 			/* unresolved mfc_caches don't contain
@@ -3219,7 +3235,7 @@ static int ipmr_dump(struct net *net, struct notifier_block *nb)
 
 	ipmr_for_each_table(mrt, net) {
 		struct vif_device *v = &mrt->vif_table[0];
-		struct mfc_cache *mfc;
+		struct mr_mfc *mfc;
 		int vifi;
 
 		/* Notifiy on table VIF entries */
@@ -3236,7 +3252,8 @@ static int ipmr_dump(struct net *net, struct notifier_block *nb)
 		/* Notify on table MFC entries */
 		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
 			call_ipmr_mfc_entry_notifier(nb, net,
-						     FIB_EVENT_ENTRY_ADD, mfc,
+						     FIB_EVENT_ENTRY_ADD,
+						     (struct mfc_cache *)mfc,
 						     mrt->id);
 	}
 
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d50852882966..3fe254f9f9b7 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -285,7 +285,7 @@ static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg,
 }
 
 static const struct rhashtable_params ip6mr_rht_params = {
-	.head_offset = offsetof(struct mfc6_cache, mnode),
+	.head_offset = offsetof(struct mr_mfc, mnode),
 	.key_offset = offsetof(struct mfc6_cache, cmparg),
 	.key_len = sizeof(struct mfc6_cache_cmp_arg),
 	.nelem_hint = 3,
@@ -335,20 +335,20 @@ static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net,
 					   struct ipmr_mfc_iter *it, loff_t pos)
 {
 	struct mr_table *mrt = it->mrt;
-	struct mfc6_cache *mfc;
+	struct mr_mfc *mfc;
 
 	rcu_read_lock();
 	it->cache = &mrt->mfc_cache_list;
 	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
 		if (pos-- == 0)
-			return mfc;
+			return (struct mfc6_cache *)mfc;
 	rcu_read_unlock();
 
 	spin_lock_bh(&mfc_unres_lock);
 	it->cache = &mrt->mfc_unres_queue;
 	list_for_each_entry(mfc, it->cache, list)
 		if (pos-- == 0)
-			return mfc;
+			return (struct mfc6_cache *)mfc;
 	spin_unlock_bh(&mfc_unres_lock);
 
 	it->cache = NULL;
@@ -492,8 +492,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (v == SEQ_START_TOKEN)
 		return ipmr_mfc_seq_idx(net, seq->private, 0);
 
-	if (mfc->list.next != it->cache)
-		return list_entry(mfc->list.next, struct mfc6_cache, list);
+	if (mfc->_c.list.next != it->cache)
+		return (struct mfc6_cache *)(list_entry(mfc->_c.list.next,
+							struct mr_mfc, list));
 
 	if (it->cache == &mrt->mfc_unres_queue)
 		goto end_of_list;
@@ -504,7 +505,9 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 	spin_lock_bh(&mfc_unres_lock);
 	if (!list_empty(it->cache))
-		return list_first_entry(it->cache, struct mfc6_cache, list);
+		return (struct mfc6_cache *)(list_first_entry(it->cache,
+							      struct mr_mfc,
+							      list));
 
  end_of_list:
 	spin_unlock_bh(&mfc_unres_lock);
@@ -540,20 +543,20 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 
 		seq_printf(seq, "%pI6 %pI6 %-3hd",
 			   &mfc->mf6c_mcastgrp, &mfc->mf6c_origin,
-			   mfc->mf6c_parent);
+			   mfc->_c.mfc_parent);
 
 		if (it->cache != &mrt->mfc_unres_queue) {
 			seq_printf(seq, " %8lu %8lu %8lu",
-				   mfc->mfc_un.res.pkt,
-				   mfc->mfc_un.res.bytes,
-				   mfc->mfc_un.res.wrong_if);
-			for (n = mfc->mfc_un.res.minvif;
-			     n < mfc->mfc_un.res.maxvif; n++) {
+				   mfc->_c.mfc_un.res.pkt,
+				   mfc->_c.mfc_un.res.bytes,
+				   mfc->_c.mfc_un.res.wrong_if);
+			for (n = mfc->_c.mfc_un.res.minvif;
+			     n < mfc->_c.mfc_un.res.maxvif; n++) {
 				if (VIF_EXISTS(mrt, n) &&
-				    mfc->mfc_un.res.ttls[n] < 255)
+				    mfc->_c.mfc_un.res.ttls[n] < 255)
 					seq_printf(seq,
-						   " %2d:%-3d",
-						   n, mfc->mfc_un.res.ttls[n]);
+						   " %2d:%-3d", n,
+						   mfc->_c.mfc_un.res.ttls[n]);
 			}
 		} else {
 			/* unresolved mfc_caches don't contain
@@ -800,14 +803,14 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
 
 static inline void ip6mr_cache_free_rcu(struct rcu_head *head)
 {
-	struct mfc6_cache *c = container_of(head, struct mfc6_cache, rcu);
+	struct mr_mfc *c = container_of(head, struct mr_mfc, rcu);
 
-	kmem_cache_free(mrt_cachep, c);
+	kmem_cache_free(mrt_cachep, (struct mfc6_cache *)c);
 }
 
 static inline void ip6mr_cache_free(struct mfc6_cache *c)
 {
-	call_rcu(&c->rcu, ip6mr_cache_free_rcu);
+	call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu);
 }
 
 /* Destroy an unresolved cache entry, killing queued skbs
@@ -821,7 +824,7 @@ static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c)
 
 	atomic_dec(&mrt->cache_resolve_queue_len);
 
-	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved)) != NULL) {
+	while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) {
 		if (ipv6_hdr(skb)->version == 0) {
 			struct nlmsghdr *nlh = skb_pull(skb,
 							sizeof(struct ipv6hdr));
@@ -844,7 +847,7 @@ static void ipmr_do_expire_process(struct mr_table *mrt)
 {
 	unsigned long now = jiffies;
 	unsigned long expires = 10 * HZ;
-	struct mfc6_cache *c, *next;
+	struct mr_mfc *c, *next;
 
 	list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
 		if (time_after(c->mfc_un.unres.expires, now)) {
@@ -856,8 +859,8 @@ static void ipmr_do_expire_process(struct mr_table *mrt)
 		}
 
 		list_del(&c->list);
-		mr6_netlink_event(mrt, c, RTM_DELROUTE);
-		ip6mr_destroy_unres(mrt, c);
+		mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
+		ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c);
 	}
 
 	if (!list_empty(&mrt->mfc_unres_queue))
@@ -882,7 +885,7 @@ static void ipmr_expire_process(struct timer_list *t)
 /* Fill oifs list. It is called under write locked mrt_lock. */
 
 static void ip6mr_update_thresholds(struct mr_table *mrt,
-				    struct mfc6_cache *cache,
+				    struct mr_mfc *cache,
 				    unsigned char *ttls)
 {
 	int vifi;
@@ -986,11 +989,11 @@ static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt,
 		.mf6c_mcastgrp = *mcastgrp,
 	};
 	struct rhlist_head *tmp, *list;
-	struct mfc6_cache *c;
+	struct mr_mfc *c;
 
 	list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params);
 	rhl_for_each_entry_rcu(c, tmp, list, mnode)
-		return c;
+		return (struct mfc6_cache *)c;
 
 	return NULL;
 }
@@ -1004,12 +1007,12 @@ static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr_table *mrt,
 		.mf6c_mcastgrp = in6addr_any,
 	};
 	struct rhlist_head *tmp, *list;
-	struct mfc6_cache *c;
+	struct mr_mfc *c;
 
 	list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params);
 	rhl_for_each_entry_rcu(c, tmp, list, mnode)
 		if (c->mfc_un.res.ttls[mifi] < 255)
-			return c;
+			return (struct mfc6_cache *)c;
 
 	return NULL;
 }
@@ -1024,7 +1027,8 @@ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt,
 		.mf6c_mcastgrp = *mcastgrp,
 	};
 	struct rhlist_head *tmp, *list;
-	struct mfc6_cache *c, *proxy;
+	struct mr_mfc *c;
+	struct mfc6_cache *proxy;
 
 	if (ipv6_addr_any(mcastgrp))
 		goto skip;
@@ -1032,12 +1036,12 @@ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt,
 	list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params);
 	rhl_for_each_entry_rcu(c, tmp, list, mnode) {
 		if (c->mfc_un.res.ttls[mifi] < 255)
-			return c;
+			return (struct mfc6_cache *)c;
 
 		/* It's ok if the mifi is part of the static tree */
-		proxy = ip6mr_cache_find_any_parent(mrt, c->mf6c_parent);
-		if (proxy && proxy->mfc_un.res.ttls[mifi] < 255)
-			return c;
+		proxy = ip6mr_cache_find_any_parent(mrt, c->mfc_parent);
+		if (proxy && proxy->_c.mfc_un.res.ttls[mifi] < 255)
+			return (struct mfc6_cache *)c;
 	}
 
 skip:
@@ -1056,12 +1060,12 @@ ip6mr_cache_find_parent(struct mr_table *mrt,
 		.mf6c_mcastgrp = *mcastgrp,
 	};
 	struct rhlist_head *tmp, *list;
-	struct mfc6_cache *c;
+	struct mr_mfc *c;
 
 	list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params);
 	rhl_for_each_entry_rcu(c, tmp, list, mnode)
-		if (parent == -1 || parent == c->mf6c_parent)
-			return c;
+		if (parent == -1 || parent == c->mfc_parent)
+			return (struct mfc6_cache *)c;
 
 	return NULL;
 }
@@ -1074,8 +1078,8 @@ static struct mfc6_cache *ip6mr_cache_alloc(void)
 	struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
 	if (!c)
 		return NULL;
-	c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
-	c->mfc_un.res.minvif = MAXMIFS;
+	c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
+	c->_c.mfc_un.res.minvif = MAXMIFS;
 	return c;
 }
 
@@ -1084,8 +1088,8 @@ static struct mfc6_cache *ip6mr_cache_alloc_unres(void)
 	struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
 	if (!c)
 		return NULL;
-	skb_queue_head_init(&c->mfc_un.unres.unresolved);
-	c->mfc_un.unres.expires = jiffies + 10 * HZ;
+	skb_queue_head_init(&c->_c.mfc_un.unres.unresolved);
+	c->_c.mfc_un.unres.expires = jiffies + 10 * HZ;
 	return c;
 }
 
@@ -1102,7 +1106,7 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt,
 	 *	Play the pending entries through our router
 	 */
 
-	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
+	while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) {
 		if (ipv6_hdr(skb)->version == 0) {
 			struct nlmsghdr *nlh = skb_pull(skb,
 							sizeof(struct ipv6hdr));
@@ -1221,19 +1225,16 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
 	return ret;
 }
 
-/*
- *	Queue a packet for resolution. It gets locked cache entry!
- */
-
-static int
-ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb)
+/* Queue a packet for resolution. It gets locked cache entry! */
+static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
+				  struct sk_buff *skb)
 {
+	struct mfc6_cache *c;
 	bool found = false;
 	int err;
-	struct mfc6_cache *c;
 
 	spin_lock_bh(&mfc_unres_lock);
-	list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
+	list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) {
 		if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) &&
 		    ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) {
 			found = true;
@@ -1254,10 +1255,8 @@ ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb)
 			return -ENOBUFS;
 		}
 
-		/*
-		 *	Fill in the new cache entry
-		 */
-		c->mf6c_parent = -1;
+		/* Fill in the new cache entry */
+		c->_c.mfc_parent = -1;
 		c->mf6c_origin = ipv6_hdr(skb)->saddr;
 		c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr;
 
@@ -1277,20 +1276,18 @@ ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb)
 		}
 
 		atomic_inc(&mrt->cache_resolve_queue_len);
-		list_add(&c->list, &mrt->mfc_unres_queue);
+		list_add(&c->_c.list, &mrt->mfc_unres_queue);
 		mr6_netlink_event(mrt, c, RTM_NEWROUTE);
 
 		ipmr_do_expire_process(mrt);
 	}
 
-	/*
-	 *	See if we can append the packet
-	 */
-	if (c->mfc_un.unres.unresolved.qlen > 3) {
+	/* See if we can append the packet */
+	if (c->_c.mfc_un.unres.unresolved.qlen > 3) {
 		kfree_skb(skb);
 		err = -ENOBUFS;
 	} else {
-		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
+		skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
 		err = 0;
 	}
 
@@ -1314,8 +1311,8 @@ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc,
 	rcu_read_unlock();
 	if (!c)
 		return -ENOENT;
-	rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params);
-	list_del_rcu(&c->list);
+	rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params);
+	list_del_rcu(&c->_c.list);
 
 	mr6_netlink_event(mrt, c, RTM_DELROUTE);
 	ip6mr_cache_free(c);
@@ -1454,6 +1451,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
 {
 	unsigned char ttls[MAXMIFS];
 	struct mfc6_cache *uc, *c;
+	struct mr_mfc *_uc;
 	bool found;
 	int i, err;
 
@@ -1473,10 +1471,10 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
 	rcu_read_unlock();
 	if (c) {
 		write_lock_bh(&mrt_lock);
-		c->mf6c_parent = mfc->mf6cc_parent;
-		ip6mr_update_thresholds(mrt, c, ttls);
+		c->_c.mfc_parent = mfc->mf6cc_parent;
+		ip6mr_update_thresholds(mrt, &c->_c, ttls);
 		if (!mrtsock)
-			c->mfc_flags |= MFC_STATIC;
+			c->_c.mfc_flags |= MFC_STATIC;
 		write_unlock_bh(&mrt_lock);
 		mr6_netlink_event(mrt, c, RTM_NEWROUTE);
 		return 0;
@@ -1492,29 +1490,30 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
 
 	c->mf6c_origin = mfc->mf6cc_origin.sin6_addr;
 	c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr;
-	c->mf6c_parent = mfc->mf6cc_parent;
-	ip6mr_update_thresholds(mrt, c, ttls);
+	c->_c.mfc_parent = mfc->mf6cc_parent;
+	ip6mr_update_thresholds(mrt, &c->_c, ttls);
 	if (!mrtsock)
-		c->mfc_flags |= MFC_STATIC;
+		c->_c.mfc_flags |= MFC_STATIC;
 
-	err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode,
+	err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode,
 				  ip6mr_rht_params);
 	if (err) {
 		pr_err("ip6mr: rhtable insert error %d\n", err);
 		ip6mr_cache_free(c);
 		return err;
 	}
-	list_add_tail_rcu(&c->list, &mrt->mfc_cache_list);
+	list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list);
 
 	/* Check to see if we resolved a queued list. If so we
 	 * need to send on the frames and tidy up.
 	 */
 	found = false;
 	spin_lock_bh(&mfc_unres_lock);
-	list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
+	list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) {
+		uc = (struct mfc6_cache *)_uc;
 		if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) &&
 		    ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) {
-			list_del(&uc->list);
+			list_del(&_uc->list);
 			atomic_dec(&mrt->cache_resolve_queue_len);
 			found = true;
 			break;
@@ -1538,7 +1537,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
 
 static void mroute_clean_tables(struct mr_table *mrt, bool all)
 {
-	struct mfc6_cache *c, *tmp;
+	struct mr_mfc *c, *tmp;
 	LIST_HEAD(list);
 	int i;
 
@@ -1556,16 +1555,17 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 			continue;
 		rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params);
 		list_del_rcu(&c->list);
-		mr6_netlink_event(mrt, c, RTM_DELROUTE);
-		ip6mr_cache_free(c);
+		mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
+		ip6mr_cache_free((struct mfc6_cache *)c);
 	}
 
 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
 		spin_lock_bh(&mfc_unres_lock);
 		list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
 			list_del(&c->list);
-			mr6_netlink_event(mrt, c, RTM_DELROUTE);
-			ip6mr_destroy_unres(mrt, c);
+			mr6_netlink_event(mrt, (struct mfc6_cache *)c,
+					  RTM_DELROUTE);
+			ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c);
 		}
 		spin_unlock_bh(&mfc_unres_lock);
 	}
@@ -1899,9 +1899,9 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
 		rcu_read_lock();
 		c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
 		if (c) {
-			sr.pktcnt = c->mfc_un.res.pkt;
-			sr.bytecnt = c->mfc_un.res.bytes;
-			sr.wrong_if = c->mfc_un.res.wrong_if;
+			sr.pktcnt = c->_c.mfc_un.res.pkt;
+			sr.bytecnt = c->_c.mfc_un.res.bytes;
+			sr.wrong_if = c->_c.mfc_un.res.wrong_if;
 			rcu_read_unlock();
 
 			if (copy_to_user(arg, &sr, sizeof(sr)))
@@ -1973,9 +1973,9 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
 		rcu_read_lock();
 		c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
 		if (c) {
-			sr.pktcnt = c->mfc_un.res.pkt;
-			sr.bytecnt = c->mfc_un.res.bytes;
-			sr.wrong_if = c->mfc_un.res.wrong_if;
+			sr.pktcnt = c->_c.mfc_un.res.pkt;
+			sr.bytecnt = c->_c.mfc_un.res.bytes;
+			sr.wrong_if = c->_c.mfc_un.res.wrong_if;
 			rcu_read_unlock();
 
 			if (copy_to_user(arg, &sr, sizeof(sr)))
@@ -2089,18 +2089,18 @@ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev)
 }
 
 static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
-			   struct sk_buff *skb, struct mfc6_cache *cache)
+			   struct sk_buff *skb, struct mfc6_cache *c)
 {
 	int psend = -1;
 	int vif, ct;
 	int true_vifi = ip6mr_find_vif(mrt, skb->dev);
 
-	vif = cache->mf6c_parent;
-	cache->mfc_un.res.pkt++;
-	cache->mfc_un.res.bytes += skb->len;
-	cache->mfc_un.res.lastuse = jiffies;
+	vif = c->_c.mfc_parent;
+	c->_c.mfc_un.res.pkt++;
+	c->_c.mfc_un.res.bytes += skb->len;
+	c->_c.mfc_un.res.lastuse = jiffies;
 
-	if (ipv6_addr_any(&cache->mf6c_origin) && true_vifi >= 0) {
+	if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) {
 		struct mfc6_cache *cache_proxy;
 
 		/* For an (*,G) entry, we only check that the incoming
@@ -2109,7 +2109,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
 		rcu_read_lock();
 		cache_proxy = ip6mr_cache_find_any_parent(mrt, vif);
 		if (cache_proxy &&
-		    cache_proxy->mfc_un.res.ttls[true_vifi] < 255) {
+		    cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) {
 			rcu_read_unlock();
 			goto forward;
 		}
@@ -2120,7 +2120,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
 	 * Wrong interface: drop packet and (maybe) send PIM assert.
 	 */
 	if (mrt->vif_table[vif].dev != skb->dev) {
-		cache->mfc_un.res.wrong_if++;
+		c->_c.mfc_un.res.wrong_if++;
 
 		if (true_vifi >= 0 && mrt->mroute_do_assert &&
 		    /* pimsm uses asserts, when switching from RPT to SPT,
@@ -2129,10 +2129,11 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
 		       large chunk of pimd to kernel. Ough... --ANK
 		     */
 		    (mrt->mroute_do_pim ||
-		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
+		     c->_c.mfc_un.res.ttls[true_vifi] < 255) &&
 		    time_after(jiffies,
-			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
-			cache->mfc_un.res.last_assert = jiffies;
+			       c->_c.mfc_un.res.last_assert +
+			       MFC_ASSERT_THRESH)) {
+			c->_c.mfc_un.res.last_assert = jiffies;
 			ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF);
 		}
 		goto dont_forward;
@@ -2145,36 +2146,38 @@ forward:
 	/*
 	 *	Forward the frame
 	 */
-	if (ipv6_addr_any(&cache->mf6c_origin) &&
-	    ipv6_addr_any(&cache->mf6c_mcastgrp)) {
+	if (ipv6_addr_any(&c->mf6c_origin) &&
+	    ipv6_addr_any(&c->mf6c_mcastgrp)) {
 		if (true_vifi >= 0 &&
-		    true_vifi != cache->mf6c_parent &&
+		    true_vifi != c->_c.mfc_parent &&
 		    ipv6_hdr(skb)->hop_limit >
-				cache->mfc_un.res.ttls[cache->mf6c_parent]) {
+				c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
 			/* It's an (*,*) entry and the packet is not coming from
 			 * the upstream: forward the packet to the upstream
 			 * only.
 			 */
-			psend = cache->mf6c_parent;
+			psend = c->_c.mfc_parent;
 			goto last_forward;
 		}
 		goto dont_forward;
 	}
-	for (ct = cache->mfc_un.res.maxvif - 1; ct >= cache->mfc_un.res.minvif; ct--) {
+	for (ct = c->_c.mfc_un.res.maxvif - 1;
+	     ct >= c->_c.mfc_un.res.minvif; ct--) {
 		/* For (*,G) entry, don't forward to the incoming interface */
-		if ((!ipv6_addr_any(&cache->mf6c_origin) || ct != true_vifi) &&
-		    ipv6_hdr(skb)->hop_limit > cache->mfc_un.res.ttls[ct]) {
+		if ((!ipv6_addr_any(&c->mf6c_origin) || ct != true_vifi) &&
+		    ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) {
 			if (psend != -1) {
 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 				if (skb2)
-					ip6mr_forward2(net, mrt, skb2, cache, psend);
+					ip6mr_forward2(net, mrt, skb2,
+						       c, psend);
 			}
 			psend = ct;
 		}
 	}
 last_forward:
 	if (psend != -1) {
-		ip6mr_forward2(net, mrt, skb, cache, psend);
+		ip6mr_forward2(net, mrt, skb, c, psend);
 		return;
 	}
 
@@ -2252,21 +2255,22 @@ static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 	int ct;
 
 	/* If cache is unresolved, don't try to parse IIF and OIF */
-	if (c->mf6c_parent >= MAXMIFS) {
+	if (c->_c.mfc_parent >= MAXMIFS) {
 		rtm->rtm_flags |= RTNH_F_UNRESOLVED;
 		return -ENOENT;
 	}
 
-	if (VIF_EXISTS(mrt, c->mf6c_parent) &&
+	if (VIF_EXISTS(mrt, c->_c.mfc_parent) &&
 	    nla_put_u32(skb, RTA_IIF,
-			mrt->vif_table[c->mf6c_parent].dev->ifindex) < 0)
+			mrt->vif_table[c->_c.mfc_parent].dev->ifindex) < 0)
 		return -EMSGSIZE;
 	mp_attr = nla_nest_start(skb, RTA_MULTIPATH);
 	if (!mp_attr)
 		return -EMSGSIZE;
 
-	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
-		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+	for (ct = c->_c.mfc_un.res.minvif;
+	     ct < c->_c.mfc_un.res.maxvif; ct++) {
+		if (VIF_EXISTS(mrt, ct) && c->_c.mfc_un.res.ttls[ct] < 255) {
 			nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
 			if (!nhp) {
 				nla_nest_cancel(skb, mp_attr);
@@ -2274,7 +2278,7 @@ static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 			}
 
 			nhp->rtnh_flags = 0;
-			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
+			nhp->rtnh_hops = c->_c.mfc_un.res.ttls[ct];
 			nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
 			nhp->rtnh_len = sizeof(*nhp);
 		}
@@ -2282,12 +2286,12 @@ static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 
 	nla_nest_end(skb, mp_attr);
 
-	lastuse = READ_ONCE(c->mfc_un.res.lastuse);
+	lastuse = READ_ONCE(c->_c.mfc_un.res.lastuse);
 	lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
 
-	mfcs.mfcs_packets = c->mfc_un.res.pkt;
-	mfcs.mfcs_bytes = c->mfc_un.res.bytes;
-	mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
+	mfcs.mfcs_packets = c->_c.mfc_un.res.pkt;
+	mfcs.mfcs_bytes = c->_c.mfc_un.res.bytes;
+	mfcs.mfcs_wrong_if = c->_c.mfc_un.res.wrong_if;
 	if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
 	    nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
 			      RTA_PAD))
@@ -2363,7 +2367,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 	}
 
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
-		cache->mfc_flags |= MFC_NOTIFY;
+		cache->_c.mfc_flags |= MFC_NOTIFY;
 
 	err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
 	read_unlock(&mrt_lock);
@@ -2392,7 +2396,7 @@ static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 		goto nla_put_failure;
 	rtm->rtm_type = RTN_MULTICAST;
 	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
-	if (c->mfc_flags & MFC_STATIC)
+	if (c->_c.mfc_flags & MFC_STATIC)
 		rtm->rtm_protocol = RTPROT_STATIC;
 	else
 		rtm->rtm_protocol = RTPROT_MROUTED;
@@ -2442,7 +2446,7 @@ static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
 	struct sk_buff *skb;
 	int err = -ENOBUFS;
 
-	skb = nlmsg_new(mr6_msgsize(mfc->mf6c_parent >= MAXMIFS, mrt->maxvif),
+	skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif),
 			GFP_ATOMIC);
 	if (!skb)
 		goto errout;
@@ -2528,10 +2532,10 @@ errout:
 static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
-	struct mr_table *mrt;
-	struct mfc6_cache *mfc;
 	unsigned int t = 0, s_t;
 	unsigned int e = 0, s_e;
+	struct mr_table *mrt;
+	struct mr_mfc *mfc;
 
 	s_t = cb->args[0];
 	s_e = cb->args[1];
@@ -2546,8 +2550,8 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 			if (ip6mr_fill_mroute(mrt, skb,
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
-					      mfc, RTM_NEWROUTE,
-					      NLM_F_MULTI) < 0)
+					      (struct mfc6_cache *)mfc,
+					      RTM_NEWROUTE, NLM_F_MULTI) < 0)
 				goto done;
 next_entry:
 			e++;
@@ -2562,8 +2566,8 @@ next_entry:
 			if (ip6mr_fill_mroute(mrt, skb,
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
-					      mfc, RTM_NEWROUTE,
-					      NLM_F_MULTI) < 0) {
+					     (struct mfc6_cache *)mfc,
+					      RTM_NEWROUTE, NLM_F_MULTI) < 0) {
 				spin_unlock_bh(&mfc_unres_lock);
 				goto done;
 			}
-- 
cgit v1.2.3


From 845c9a7ae7f5342ba42280c3a2f2aa92bce641d7 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Wed, 28 Feb 2018 23:29:35 +0200
Subject: ipmr, ip6mr: Unite logic for searching in MFC cache

ipmr and ip6mr utilize the exact same methods for searching the
hashed resolved connections, difference being only in the construction
of the hash comparison key.

In order to unite the flow, introduce an mr_table operation set that
would contain the protocol specific information required for common
flows, in this case - the hash parameters and a comparison key
representing a (*,*) route.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c      | 71 ++++++++++++-------------------------------------
 net/ipv4/ipmr_base.c | 54 ++++++++++++++++++++++++++++++++++++--
 net/ipv6/ip6mr.c     | 74 ++++++++++++----------------------------------------
 3 files changed, 85 insertions(+), 114 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 3f7515086e85..00898c319cc5 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -360,6 +360,16 @@ static void ipmr_new_table_set(struct mr_table *mrt,
 #endif
 }
 
+static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = {
+	.mfc_mcastgrp = htonl(INADDR_ANY),
+	.mfc_origin = htonl(INADDR_ANY),
+};
+
+static struct mr_table_ops ipmr_mr_table_ops = {
+	.rht_params = &ipmr_rht_params,
+	.cmparg_any = &ipmr_mr_table_ops_cmparg_any,
+};
+
 static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 {
 	struct mr_table *mrt;
@@ -372,7 +382,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 	if (mrt)
 		return mrt;
 
-	return mr_table_alloc(net, id, &ipmr_rht_params,
+	return mr_table_alloc(net, id, &ipmr_mr_table_ops,
 			      ipmr_expire_process, ipmr_new_table_set);
 }
 
@@ -973,33 +983,8 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 			.mfc_mcastgrp = mcastgrp,
 			.mfc_origin = origin
 	};
-	struct rhlist_head *tmp, *list;
-	struct mr_mfc *c;
-
-	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
-	rhl_for_each_entry_rcu(c, tmp, list, mnode)
-		return (struct mfc_cache *)c;
-
-	return NULL;
-}
-
-/* Look for a (*,*,oif) entry */
-static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
-						    int vifi)
-{
-	struct mfc_cache_cmp_arg arg = {
-			.mfc_mcastgrp = htonl(INADDR_ANY),
-			.mfc_origin = htonl(INADDR_ANY)
-	};
-	struct rhlist_head *tmp, *list;
-	struct mr_mfc *c;
-
-	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
-	rhl_for_each_entry_rcu(c, tmp, list, mnode)
-		if (c->mfc_un.res.ttls[vifi] < 255)
-			return (struct mfc_cache *)c;
 
-	return NULL;
+	return mr_mfc_find(mrt, &arg);
 }
 
 /* Look for a (*,G) entry */
@@ -1010,27 +995,10 @@ static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
 			.mfc_mcastgrp = mcastgrp,
 			.mfc_origin = htonl(INADDR_ANY)
 	};
-	struct rhlist_head *tmp, *list;
-	struct mr_mfc *c;
 
 	if (mcastgrp == htonl(INADDR_ANY))
-		goto skip;
-
-	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
-	rhl_for_each_entry_rcu(c, tmp, list, mnode) {
-		struct mfc_cache *proxy;
-
-		if (c->mfc_un.res.ttls[vifi] < 255)
-			return (struct mfc_cache *)c;
-
-		/* It's ok if the vifi is part of the static tree */
-		proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent);
-		if (proxy && proxy->_c.mfc_un.res.ttls[vifi] < 255)
-			return (struct mfc_cache *)c;
-	}
-
-skip:
-	return ipmr_cache_find_any_parent(mrt, vifi);
+		return mr_mfc_find_any_parent(mrt, vifi);
+	return mr_mfc_find_any(mrt, vifi, &arg);
 }
 
 /* Look for a (S,G,iif) entry if parent != -1 */
@@ -1042,15 +1010,8 @@ static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
 			.mfc_mcastgrp = mcastgrp,
 			.mfc_origin = origin,
 	};
-	struct rhlist_head *tmp, *list;
-	struct mr_mfc *c;
 
-	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
-	rhl_for_each_entry_rcu(c, tmp, list, mnode)
-		if (parent == -1 || parent == c->mfc_parent)
-			return (struct mfc_cache *)c;
-
-	return NULL;
+	return mr_mfc_find_parent(mrt, &arg, parent);
 }
 
 /* Allocate a multicast cache entry */
@@ -2010,7 +1971,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 		/* For an (*,G) entry, we only check that the incomming
 		 * interface is part of the static tree.
 		 */
-		cache_proxy = ipmr_cache_find_any_parent(mrt, vif);
+		cache_proxy = mr_mfc_find_any_parent(mrt, vif);
 		if (cache_proxy &&
 		    cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255)
 			goto forward;
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 3e21a5806d0e..172f92acffef 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -29,7 +29,7 @@ EXPORT_SYMBOL(vif_device_init);
 
 struct mr_table *
 mr_table_alloc(struct net *net, u32 id,
-	       const struct rhashtable_params *rht_params,
+	       struct mr_table_ops *ops,
 	       void (*expire_func)(struct timer_list *t),
 	       void (*table_set)(struct mr_table *mrt,
 				 struct net *net))
@@ -42,7 +42,8 @@ mr_table_alloc(struct net *net, u32 id,
 	mrt->id = id;
 	write_pnet(&mrt->net, net);
 
-	rhltable_init(&mrt->mfc_hash, rht_params);
+	mrt->ops = *ops;
+	rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params);
 	INIT_LIST_HEAD(&mrt->mfc_cache_list);
 	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
 
@@ -53,3 +54,52 @@ mr_table_alloc(struct net *net, u32 id,
 	return mrt;
 }
 EXPORT_SYMBOL(mr_table_alloc);
+
+void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent)
+{
+	struct rhlist_head *tmp, *list;
+	struct mr_mfc *c;
+
+	list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		if (parent == -1 || parent == c->mfc_parent)
+			return c;
+
+	return NULL;
+}
+EXPORT_SYMBOL(mr_mfc_find_parent);
+
+void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi)
+{
+	struct rhlist_head *tmp, *list;
+	struct mr_mfc *c;
+
+	list = rhltable_lookup(&mrt->mfc_hash, mrt->ops.cmparg_any,
+			       *mrt->ops.rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode)
+		if (c->mfc_un.res.ttls[vifi] < 255)
+			return c;
+
+	return NULL;
+}
+EXPORT_SYMBOL(mr_mfc_find_any_parent);
+
+void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg)
+{
+	struct rhlist_head *tmp, *list;
+	struct mr_mfc *c, *proxy;
+
+	list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params);
+	rhl_for_each_entry_rcu(c, tmp, list, mnode) {
+		if (c->mfc_un.res.ttls[vifi] < 255)
+			return c;
+
+		/* It's ok if the vifi is part of the static tree */
+		proxy = mr_mfc_find_any_parent(mrt, c->mfc_parent);
+		if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
+			return c;
+	}
+
+	return mr_mfc_find_any_parent(mrt, vifi);
+}
+EXPORT_SYMBOL(mr_mfc_find_any);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 3fe254f9f9b7..ea902a952fb6 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -302,6 +302,16 @@ static void ip6mr_new_table_set(struct mr_table *mrt,
 #endif
 }
 
+static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = {
+	.mf6c_origin = IN6ADDR_ANY_INIT,
+	.mf6c_mcastgrp = IN6ADDR_ANY_INIT,
+};
+
+static struct mr_table_ops ip6mr_mr_table_ops = {
+	.rht_params = &ip6mr_rht_params,
+	.cmparg_any = &ip6mr_mr_table_ops_cmparg_any,
+};
+
 static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
 {
 	struct mr_table *mrt;
@@ -310,7 +320,7 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
 	if (mrt)
 		return mrt;
 
-	return mr_table_alloc(net, id, &ip6mr_rht_params,
+	return mr_table_alloc(net, id, &ip6mr_mr_table_ops,
 			      ipmr_expire_process, ip6mr_new_table_set);
 }
 
@@ -988,33 +998,8 @@ static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt,
 		.mf6c_origin = *origin,
 		.mf6c_mcastgrp = *mcastgrp,
 	};
-	struct rhlist_head *tmp, *list;
-	struct mr_mfc *c;
-
-	list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params);
-	rhl_for_each_entry_rcu(c, tmp, list, mnode)
-		return (struct mfc6_cache *)c;
 
-	return NULL;
-}
-
-/* Look for a (*,*,oif) entry */
-static struct mfc6_cache *ip6mr_cache_find_any_parent(struct mr_table *mrt,
-						      mifi_t mifi)
-{
-	struct mfc6_cache_cmp_arg arg = {
-		.mf6c_origin = in6addr_any,
-		.mf6c_mcastgrp = in6addr_any,
-	};
-	struct rhlist_head *tmp, *list;
-	struct mr_mfc *c;
-
-	list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params);
-	rhl_for_each_entry_rcu(c, tmp, list, mnode)
-		if (c->mfc_un.res.ttls[mifi] < 255)
-			return (struct mfc6_cache *)c;
-
-	return NULL;
+	return mr_mfc_find(mrt, &arg);
 }
 
 /* Look for a (*,G) entry */
@@ -1026,26 +1011,10 @@ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt,
 		.mf6c_origin = in6addr_any,
 		.mf6c_mcastgrp = *mcastgrp,
 	};
-	struct rhlist_head *tmp, *list;
-	struct mr_mfc *c;
-	struct mfc6_cache *proxy;
 
 	if (ipv6_addr_any(mcastgrp))
-		goto skip;
-
-	list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params);
-	rhl_for_each_entry_rcu(c, tmp, list, mnode) {
-		if (c->mfc_un.res.ttls[mifi] < 255)
-			return (struct mfc6_cache *)c;
-
-		/* It's ok if the mifi is part of the static tree */
-		proxy = ip6mr_cache_find_any_parent(mrt, c->mfc_parent);
-		if (proxy && proxy->_c.mfc_un.res.ttls[mifi] < 255)
-			return (struct mfc6_cache *)c;
-	}
-
-skip:
-	return ip6mr_cache_find_any_parent(mrt, mifi);
+		return mr_mfc_find_any_parent(mrt, mifi);
+	return mr_mfc_find_any(mrt, mifi, &arg);
 }
 
 /* Look for a (S,G,iif) entry if parent != -1 */
@@ -1059,20 +1028,11 @@ ip6mr_cache_find_parent(struct mr_table *mrt,
 		.mf6c_origin = *origin,
 		.mf6c_mcastgrp = *mcastgrp,
 	};
-	struct rhlist_head *tmp, *list;
-	struct mr_mfc *c;
-
-	list = rhltable_lookup(&mrt->mfc_hash, &arg, ip6mr_rht_params);
-	rhl_for_each_entry_rcu(c, tmp, list, mnode)
-		if (parent == -1 || parent == c->mfc_parent)
-			return (struct mfc6_cache *)c;
 
-	return NULL;
+	return mr_mfc_find_parent(mrt, &arg, parent);
 }
 
-/*
- *	Allocate a multicast cache entry
- */
+/* Allocate a multicast cache entry */
 static struct mfc6_cache *ip6mr_cache_alloc(void)
 {
 	struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
@@ -2107,7 +2067,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
 		 * interface is part of the static tree.
 		 */
 		rcu_read_lock();
-		cache_proxy = ip6mr_cache_find_any_parent(mrt, vif);
+		cache_proxy = mr_mfc_find_any_parent(mrt, vif);
 		if (cache_proxy &&
 		    cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) {
 			rcu_read_unlock();
-- 
cgit v1.2.3


From c8d6196803265484f7e1cdd1b00a188dc59a5988 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Wed, 28 Feb 2018 23:29:36 +0200
Subject: ipmr, ip6mr: Unite mfc seq logic

With the exception of the final dump, ipmr and ip6mr have the exact same
seq logic for traversing a given mr_table. Refactor that code and make
it common.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c      | 93 +++----------------------------------------------
 net/ipv4/ipmr_base.c | 62 +++++++++++++++++++++++++++++++++
 net/ipv6/ip6mr.c     | 97 ++++------------------------------------------------
 3 files changed, 74 insertions(+), 178 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 00898c319cc5..1eb19d9b1be0 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -3014,41 +3014,8 @@ static const struct file_operations ipmr_vif_fops = {
 	.release = seq_release_net,
 };
 
-struct ipmr_mfc_iter {
-	struct seq_net_private p;
-	struct mr_table *mrt;
-	struct list_head *cache;
-};
-
-static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
-					  struct ipmr_mfc_iter *it, loff_t pos)
-{
-	struct mr_table *mrt = it->mrt;
-	struct mr_mfc *mfc;
-
-	rcu_read_lock();
-	it->cache = &mrt->mfc_cache_list;
-	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
-		if (pos-- == 0)
-			return (struct mfc_cache *)mfc;
-	rcu_read_unlock();
-
-	spin_lock_bh(&mfc_unres_lock);
-	it->cache = &mrt->mfc_unres_queue;
-	list_for_each_entry(mfc, it->cache, list)
-		if (pos-- == 0)
-			return (struct mfc_cache *)mfc;
-
-	spin_unlock_bh(&mfc_unres_lock);
-
-	it->cache = NULL;
-	return NULL;
-}
-
-
 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct ipmr_mfc_iter *it = seq->private;
 	struct net *net = seq_file_net(seq);
 	struct mr_table *mrt;
 
@@ -3056,57 +3023,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 	if (!mrt)
 		return ERR_PTR(-ENOENT);
 
-	it->mrt = mrt;
-	it->cache = NULL;
-	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
-		: SEQ_START_TOKEN;
-}
-
-static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	struct ipmr_mfc_iter *it = seq->private;
-	struct net *net = seq_file_net(seq);
-	struct mr_table *mrt = it->mrt;
-	struct mfc_cache *mfc = v;
-
-	++*pos;
-
-	if (v == SEQ_START_TOKEN)
-		return ipmr_mfc_seq_idx(net, seq->private, 0);
-
-	if (mfc->_c.list.next != it->cache)
-		return (struct mfc_cache *)(list_entry(mfc->_c.list.next,
-						       struct mr_mfc, list));
-
-	if (it->cache == &mrt->mfc_unres_queue)
-		goto end_of_list;
-
-	/* exhausted cache_array, show unresolved */
-	rcu_read_unlock();
-	it->cache = &mrt->mfc_unres_queue;
-
-	spin_lock_bh(&mfc_unres_lock);
-	if (!list_empty(it->cache))
-		return (struct mfc_cache *)(list_first_entry(it->cache,
-							     struct mr_mfc,
-							     list));
-
-end_of_list:
-	spin_unlock_bh(&mfc_unres_lock);
-	it->cache = NULL;
-
-	return NULL;
-}
-
-static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
-{
-	struct ipmr_mfc_iter *it = seq->private;
-	struct mr_table *mrt = it->mrt;
-
-	if (it->cache == &mrt->mfc_unres_queue)
-		spin_unlock_bh(&mfc_unres_lock);
-	else if (it->cache == &mrt->mfc_cache_list)
-		rcu_read_unlock();
+	return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock);
 }
 
 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
@@ -3118,7 +3035,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
 	} else {
 		const struct mfc_cache *mfc = v;
-		const struct ipmr_mfc_iter *it = seq->private;
+		const struct mr_mfc_iter *it = seq->private;
 		const struct mr_table *mrt = it->mrt;
 
 		seq_printf(seq, "%08X %08X %-3hd",
@@ -3152,15 +3069,15 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 
 static const struct seq_operations ipmr_mfc_seq_ops = {
 	.start = ipmr_mfc_seq_start,
-	.next  = ipmr_mfc_seq_next,
-	.stop  = ipmr_mfc_seq_stop,
+	.next  = mr_mfc_seq_next,
+	.stop  = mr_mfc_seq_stop,
 	.show  = ipmr_mfc_seq_show,
 };
 
 static int ipmr_mfc_open(struct inode *inode, struct file *file)
 {
 	return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
-			    sizeof(struct ipmr_mfc_iter));
+			    sizeof(struct mr_mfc_iter));
 }
 
 static const struct file_operations ipmr_mfc_fops = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 172f92acffef..37ad0a793035 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -103,3 +103,65 @@ void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg)
 	return mr_mfc_find_any_parent(mrt, vifi);
 }
 EXPORT_SYMBOL(mr_mfc_find_any);
+
+#ifdef CONFIG_PROC_FS
+void *mr_mfc_seq_idx(struct net *net,
+		     struct mr_mfc_iter *it, loff_t pos)
+{
+	struct mr_table *mrt = it->mrt;
+	struct mr_mfc *mfc;
+
+	rcu_read_lock();
+	it->cache = &mrt->mfc_cache_list;
+	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+		if (pos-- == 0)
+			return mfc;
+	rcu_read_unlock();
+
+	spin_lock_bh(it->lock);
+	it->cache = &mrt->mfc_unres_queue;
+	list_for_each_entry(mfc, it->cache, list)
+		if (pos-- == 0)
+			return mfc;
+	spin_unlock_bh(it->lock);
+
+	it->cache = NULL;
+	return NULL;
+}
+EXPORT_SYMBOL(mr_mfc_seq_idx);
+
+void *mr_mfc_seq_next(struct seq_file *seq, void *v,
+		      loff_t *pos)
+{
+	struct mr_mfc_iter *it = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt = it->mrt;
+	struct mr_mfc *c = v;
+
+	++*pos;
+
+	if (v == SEQ_START_TOKEN)
+		return mr_mfc_seq_idx(net, seq->private, 0);
+
+	if (c->list.next != it->cache)
+		return list_entry(c->list.next, struct mr_mfc, list);
+
+	if (it->cache == &mrt->mfc_unres_queue)
+		goto end_of_list;
+
+	/* exhausted cache_array, show unresolved */
+	rcu_read_unlock();
+	it->cache = &mrt->mfc_unres_queue;
+
+	spin_lock_bh(it->lock);
+	if (!list_empty(it->cache))
+		return list_first_entry(it->cache, struct mr_mfc, list);
+
+end_of_list:
+	spin_unlock_bh(it->lock);
+	it->cache = NULL;
+
+	return NULL;
+}
+EXPORT_SYMBOL(mr_mfc_seq_next);
+#endif
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index ea902a952fb6..26315065e7df 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -333,40 +333,8 @@ static void ip6mr_free_table(struct mr_table *mrt)
 }
 
 #ifdef CONFIG_PROC_FS
-
-struct ipmr_mfc_iter {
-	struct seq_net_private p;
-	struct mr_table *mrt;
-	struct list_head *cache;
-};
-
-
-static struct mfc6_cache *ipmr_mfc_seq_idx(struct net *net,
-					   struct ipmr_mfc_iter *it, loff_t pos)
-{
-	struct mr_table *mrt = it->mrt;
-	struct mr_mfc *mfc;
-
-	rcu_read_lock();
-	it->cache = &mrt->mfc_cache_list;
-	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
-		if (pos-- == 0)
-			return (struct mfc6_cache *)mfc;
-	rcu_read_unlock();
-
-	spin_lock_bh(&mfc_unres_lock);
-	it->cache = &mrt->mfc_unres_queue;
-	list_for_each_entry(mfc, it->cache, list)
-		if (pos-- == 0)
-			return (struct mfc6_cache *)mfc;
-	spin_unlock_bh(&mfc_unres_lock);
-
-	it->cache = NULL;
-	return NULL;
-}
-
-/*
- *	The /proc interfaces to multicast routing /proc/ip6_mr_cache /proc/ip6_mr_vif
+/* The /proc interfaces to multicast routing
+ * /proc/ip6_mr_cache /proc/ip6_mr_vif
  */
 
 struct ipmr_vif_iter {
@@ -476,7 +444,6 @@ static const struct file_operations ip6mr_vif_fops = {
 
 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	struct ipmr_mfc_iter *it = seq->private;
 	struct net *net = seq_file_net(seq);
 	struct mr_table *mrt;
 
@@ -484,57 +451,7 @@ static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
 	if (!mrt)
 		return ERR_PTR(-ENOENT);
 
-	it->mrt = mrt;
-	it->cache = NULL;
-	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
-		: SEQ_START_TOKEN;
-}
-
-static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	struct mfc6_cache *mfc = v;
-	struct ipmr_mfc_iter *it = seq->private;
-	struct net *net = seq_file_net(seq);
-	struct mr_table *mrt = it->mrt;
-
-	++*pos;
-
-	if (v == SEQ_START_TOKEN)
-		return ipmr_mfc_seq_idx(net, seq->private, 0);
-
-	if (mfc->_c.list.next != it->cache)
-		return (struct mfc6_cache *)(list_entry(mfc->_c.list.next,
-							struct mr_mfc, list));
-
-	if (it->cache == &mrt->mfc_unres_queue)
-		goto end_of_list;
-
-	/* exhausted cache_array, show unresolved */
-	rcu_read_unlock();
-	it->cache = &mrt->mfc_unres_queue;
-
-	spin_lock_bh(&mfc_unres_lock);
-	if (!list_empty(it->cache))
-		return (struct mfc6_cache *)(list_first_entry(it->cache,
-							      struct mr_mfc,
-							      list));
-
- end_of_list:
-	spin_unlock_bh(&mfc_unres_lock);
-	it->cache = NULL;
-
-	return NULL;
-}
-
-static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
-{
-	struct ipmr_mfc_iter *it = seq->private;
-	struct mr_table *mrt = it->mrt;
-
-	if (it->cache == &mrt->mfc_unres_queue)
-		spin_unlock_bh(&mfc_unres_lock);
-	else if (it->cache == &mrt->mfc_cache_list)
-		rcu_read_unlock();
+	return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock);
 }
 
 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
@@ -548,7 +465,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 			 "Iif      Pkts  Bytes     Wrong  Oifs\n");
 	} else {
 		const struct mfc6_cache *mfc = v;
-		const struct ipmr_mfc_iter *it = seq->private;
+		const struct mr_mfc_iter *it = seq->private;
 		struct mr_table *mrt = it->mrt;
 
 		seq_printf(seq, "%pI6 %pI6 %-3hd",
@@ -581,15 +498,15 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 
 static const struct seq_operations ipmr_mfc_seq_ops = {
 	.start = ipmr_mfc_seq_start,
-	.next  = ipmr_mfc_seq_next,
-	.stop  = ipmr_mfc_seq_stop,
+	.next  = mr_mfc_seq_next,
+	.stop  = mr_mfc_seq_stop,
 	.show  = ipmr_mfc_seq_show,
 };
 
 static int ipmr_mfc_open(struct inode *inode, struct file *file)
 {
 	return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
-			    sizeof(struct ipmr_mfc_iter));
+			    sizeof(struct mr_mfc_iter));
 }
 
 static const struct file_operations ip6mr_mfc_fops = {
-- 
cgit v1.2.3


From 3feda6b46f734704840685a62b645cbe4efb810c Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Wed, 28 Feb 2018 23:29:37 +0200
Subject: ipmr, ip6mr: Unite vif seq functions

Same as previously done with the mfc seq, the logic for the vif seq is
refactored to be shared between ipmr and ip6mr.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c      | 49 +++++--------------------------------------------
 net/ipv4/ipmr_base.c | 33 +++++++++++++++++++++++++++++++++
 net/ipv6/ip6mr.c     | 50 +++++---------------------------------------------
 3 files changed, 43 insertions(+), 89 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 1eb19d9b1be0..f5ff54297824 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2908,31 +2908,11 @@ out:
 /* The /proc interfaces to multicast routing :
  * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
  */
-struct ipmr_vif_iter {
-	struct seq_net_private p;
-	struct mr_table *mrt;
-	int ct;
-};
-
-static struct vif_device *ipmr_vif_seq_idx(struct net *net,
-					    struct ipmr_vif_iter *iter,
-					    loff_t pos)
-{
-	struct mr_table *mrt = iter->mrt;
-
-	for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
-		if (!VIF_EXISTS(mrt, iter->ct))
-			continue;
-		if (pos-- == 0)
-			return &mrt->vif_table[iter->ct];
-	}
-	return NULL;
-}
 
 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(mrt_lock)
 {
-	struct ipmr_vif_iter *iter = seq->private;
+	struct mr_vif_iter *iter = seq->private;
 	struct net *net = seq_file_net(seq);
 	struct mr_table *mrt;
 
@@ -2943,26 +2923,7 @@ static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
 	iter->mrt = mrt;
 
 	read_lock(&mrt_lock);
-	return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
-		: SEQ_START_TOKEN;
-}
-
-static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	struct ipmr_vif_iter *iter = seq->private;
-	struct net *net = seq_file_net(seq);
-	struct mr_table *mrt = iter->mrt;
-
-	++*pos;
-	if (v == SEQ_START_TOKEN)
-		return ipmr_vif_seq_idx(net, iter, 0);
-
-	while (++iter->ct < mrt->maxvif) {
-		if (!VIF_EXISTS(mrt, iter->ct))
-			continue;
-		return &mrt->vif_table[iter->ct];
-	}
-	return NULL;
+	return mr_vif_seq_start(seq, pos);
 }
 
 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
@@ -2973,7 +2934,7 @@ static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
 
 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
 {
-	struct ipmr_vif_iter *iter = seq->private;
+	struct mr_vif_iter *iter = seq->private;
 	struct mr_table *mrt = iter->mrt;
 
 	if (v == SEQ_START_TOKEN) {
@@ -2996,7 +2957,7 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
 
 static const struct seq_operations ipmr_vif_seq_ops = {
 	.start = ipmr_vif_seq_start,
-	.next  = ipmr_vif_seq_next,
+	.next  = mr_vif_seq_next,
 	.stop  = ipmr_vif_seq_stop,
 	.show  = ipmr_vif_seq_show,
 };
@@ -3004,7 +2965,7 @@ static const struct seq_operations ipmr_vif_seq_ops = {
 static int ipmr_vif_open(struct inode *inode, struct file *file)
 {
 	return seq_open_net(inode, file, &ipmr_vif_seq_ops,
-			    sizeof(struct ipmr_vif_iter));
+			    sizeof(struct mr_vif_iter));
 }
 
 static const struct file_operations ipmr_vif_fops = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 37ad0a793035..e1b7b639e9b1 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -105,6 +105,39 @@ void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg)
 EXPORT_SYMBOL(mr_mfc_find_any);
 
 #ifdef CONFIG_PROC_FS
+void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos)
+{
+	struct mr_table *mrt = iter->mrt;
+
+	for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
+		if (!VIF_EXISTS(mrt, iter->ct))
+			continue;
+		if (pos-- == 0)
+			return &mrt->vif_table[iter->ct];
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(mr_vif_seq_idx);
+
+void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct mr_vif_iter *iter = seq->private;
+	struct net *net = seq_file_net(seq);
+	struct mr_table *mrt = iter->mrt;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return mr_vif_seq_idx(net, iter, 0);
+
+	while (++iter->ct < mrt->maxvif) {
+		if (!VIF_EXISTS(mrt, iter->ct))
+			continue;
+		return &mrt->vif_table[iter->ct];
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(mr_vif_seq_next);
+
 void *mr_mfc_seq_idx(struct net *net,
 		     struct mr_mfc_iter *it, loff_t pos)
 {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 26315065e7df..ddd9e6bba499 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -337,31 +337,10 @@ static void ip6mr_free_table(struct mr_table *mrt)
  * /proc/ip6_mr_cache /proc/ip6_mr_vif
  */
 
-struct ipmr_vif_iter {
-	struct seq_net_private p;
-	struct mr_table *mrt;
-	int ct;
-};
-
-static struct vif_device *ip6mr_vif_seq_idx(struct net *net,
-					    struct ipmr_vif_iter *iter,
-					    loff_t pos)
-{
-	struct mr_table *mrt = iter->mrt;
-
-	for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
-		if (!VIF_EXISTS(mrt, iter->ct))
-			continue;
-		if (pos-- == 0)
-			return &mrt->vif_table[iter->ct];
-	}
-	return NULL;
-}
-
 static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(mrt_lock)
 {
-	struct ipmr_vif_iter *iter = seq->private;
+	struct mr_vif_iter *iter = seq->private;
 	struct net *net = seq_file_net(seq);
 	struct mr_table *mrt;
 
@@ -372,26 +351,7 @@ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
 	iter->mrt = mrt;
 
 	read_lock(&mrt_lock);
-	return *pos ? ip6mr_vif_seq_idx(net, seq->private, *pos - 1)
-		: SEQ_START_TOKEN;
-}
-
-static void *ip6mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	struct ipmr_vif_iter *iter = seq->private;
-	struct net *net = seq_file_net(seq);
-	struct mr_table *mrt = iter->mrt;
-
-	++*pos;
-	if (v == SEQ_START_TOKEN)
-		return ip6mr_vif_seq_idx(net, iter, 0);
-
-	while (++iter->ct < mrt->maxvif) {
-		if (!VIF_EXISTS(mrt, iter->ct))
-			continue;
-		return &mrt->vif_table[iter->ct];
-	}
-	return NULL;
+	return mr_vif_seq_start(seq, pos);
 }
 
 static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
@@ -402,7 +362,7 @@ static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
 
 static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
 {
-	struct ipmr_vif_iter *iter = seq->private;
+	struct mr_vif_iter *iter = seq->private;
 	struct mr_table *mrt = iter->mrt;
 
 	if (v == SEQ_START_TOKEN) {
@@ -424,7 +384,7 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
 
 static const struct seq_operations ip6mr_vif_seq_ops = {
 	.start = ip6mr_vif_seq_start,
-	.next  = ip6mr_vif_seq_next,
+	.next  = mr_vif_seq_next,
 	.stop  = ip6mr_vif_seq_stop,
 	.show  = ip6mr_vif_seq_show,
 };
@@ -432,7 +392,7 @@ static const struct seq_operations ip6mr_vif_seq_ops = {
 static int ip6mr_vif_open(struct inode *inode, struct file *file)
 {
 	return seq_open_net(inode, file, &ip6mr_vif_seq_ops,
-			    sizeof(struct ipmr_vif_iter));
+			    sizeof(struct mr_vif_iter));
 }
 
 static const struct file_operations ip6mr_vif_fops = {
-- 
cgit v1.2.3


From 889cd83cbe411dda854429f3223ab2d31a860a4a Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Wed, 28 Feb 2018 23:29:38 +0200
Subject: ip6mr: Remove MFC_NOTIFY and refactor flags

MFC_NOTIFY exists in ip6mr, probably as some legacy code
[was already removed for ipmr in commit
06bd6c0370bb ("net: ipmr: remove unused MFC_NOTIFY flag and make the flags enum").
Remove it from ip6mr as well, and move the enum into a common file;
Notice MFC_OFFLOAD is currently only used by ipmr.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index ddd9e6bba499..c3b3f1c381e1 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2203,9 +2203,6 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 		return err;
 	}
 
-	if (rtm->rtm_flags & RTM_F_NOTIFY)
-		cache->_c.mfc_flags |= MFC_NOTIFY;
-
 	err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
 	read_unlock(&mrt_lock);
 	return err;
-- 
cgit v1.2.3


From 7b0db85737db3f4d76b2a412e4f19eae59b8b494 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Wed, 28 Feb 2018 23:29:39 +0200
Subject: ipmr, ip6mr: Unite dumproute flows

The various MFC entries are being held in the same kind of mr_tables
for both ipmr and ip6mr, and their traversal logic is identical.
Also, with the exception of the addresses [and other small tidbits]
the major bulk of the nla setting is identical.

Unite as much of the dumping as possible between the two.
Notice this requires creating an mr_table iterator for each, as the
for-each preprocessor macro can't be used by the common logic.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c      | 161 +++++++++++++--------------------------------------
 net/ipv4/ipmr_base.c | 123 +++++++++++++++++++++++++++++++++++++++
 net/ipv6/ip6mr.c     | 156 +++++++++++++------------------------------------
 3 files changed, 201 insertions(+), 239 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index f5ff54297824..d752a70855d8 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -105,8 +105,6 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
 			  struct mfc_cache *cache, int local);
 static int ipmr_cache_report(struct mr_table *mrt,
 			     struct sk_buff *pkt, vifi_t vifi, int assert);
-static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
-			      struct mr_mfc *c, struct rtmsg *rtm);
 static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
 				 int cmd);
 static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
@@ -117,6 +115,23 @@ static void ipmr_expire_process(struct timer_list *t);
 #define ipmr_for_each_table(mrt, net) \
 	list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
 
+static struct mr_table *ipmr_mr_table_iter(struct net *net,
+					   struct mr_table *mrt)
+{
+	struct mr_table *ret;
+
+	if (!mrt)
+		ret = list_entry_rcu(net->ipv4.mr_tables.next,
+				     struct mr_table, list);
+	else
+		ret = list_entry_rcu(mrt->list.next,
+				     struct mr_table, list);
+
+	if (&ret->list == &net->ipv4.mr_tables)
+		return NULL;
+	return ret;
+}
+
 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
 {
 	struct mr_table *mrt;
@@ -284,6 +299,14 @@ EXPORT_SYMBOL(ipmr_rule_default);
 #define ipmr_for_each_table(mrt, net) \
 	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)
 
+static struct mr_table *ipmr_mr_table_iter(struct net *net,
+					   struct mr_table *mrt)
+{
+	if (!mrt)
+		return net->ipv4.mrt;
+	return NULL;
+}
+
 static struct mr_table *ipmr_get_table(struct net *net, u32 id)
 {
 	return net->ipv4.mrt;
@@ -1051,8 +1074,8 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
 			struct nlmsghdr *nlh = skb_pull(skb,
 							sizeof(struct iphdr));
 
-			if (__ipmr_fill_mroute(mrt, skb, &c->_c,
-					       nlmsg_data(nlh)) > 0) {
+			if (mr_fill_mroute(mrt, skb, &c->_c,
+					   nlmsg_data(nlh)) > 0) {
 				nlh->nlmsg_len = skb_tail_pointer(skb) -
 						 (u8 *)nlh;
 			} else {
@@ -2256,66 +2279,6 @@ drop:
 }
 #endif
 
-static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
-			      struct mr_mfc *c, struct rtmsg *rtm)
-{
-	struct rta_mfc_stats mfcs;
-	struct nlattr *mp_attr;
-	struct rtnexthop *nhp;
-	unsigned long lastuse;
-	int ct;
-
-	/* If cache is unresolved, don't try to parse IIF and OIF */
-	if (c->mfc_parent >= MAXVIFS) {
-		rtm->rtm_flags |= RTNH_F_UNRESOLVED;
-		return -ENOENT;
-	}
-
-	if (VIF_EXISTS(mrt, c->mfc_parent) &&
-	    nla_put_u32(skb, RTA_IIF,
-			mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
-		return -EMSGSIZE;
-
-	if (c->mfc_flags & MFC_OFFLOAD)
-		rtm->rtm_flags |= RTNH_F_OFFLOAD;
-
-	if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
-		return -EMSGSIZE;
-
-	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
-		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
-			struct vif_device *vif;
-
-			if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) {
-				nla_nest_cancel(skb, mp_attr);
-				return -EMSGSIZE;
-			}
-
-			nhp->rtnh_flags = 0;
-			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
-			vif = &mrt->vif_table[ct];
-			nhp->rtnh_ifindex = vif->dev->ifindex;
-			nhp->rtnh_len = sizeof(*nhp);
-		}
-	}
-
-	nla_nest_end(skb, mp_attr);
-
-	lastuse = READ_ONCE(c->mfc_un.res.lastuse);
-	lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
-
-	mfcs.mfcs_packets = c->mfc_un.res.pkt;
-	mfcs.mfcs_bytes = c->mfc_un.res.bytes;
-	mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
-	if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
-	    nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
-			      RTA_PAD))
-		return -EMSGSIZE;
-
-	rtm->rtm_type = RTN_MULTICAST;
-	return 1;
-}
-
 int ipmr_get_route(struct net *net, struct sk_buff *skb,
 		   __be32 saddr, __be32 daddr,
 		   struct rtmsg *rtm, u32 portid)
@@ -2373,7 +2336,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
 	}
 
 	read_lock(&mrt_lock);
-	err = __ipmr_fill_mroute(mrt, skb, &cache->_c, rtm);
+	err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
 	read_unlock(&mrt_lock);
 	rcu_read_unlock();
 	return err;
@@ -2410,7 +2373,7 @@ static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 	if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
 	    nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
 		goto nla_put_failure;
-	err = __ipmr_fill_mroute(mrt, skb, &c->_c, rtm);
+	err = mr_fill_mroute(mrt, skb, &c->_c, rtm);
 	/* do not break the dump if cache is unresolved */
 	if (err < 0 && err != -ENOENT)
 		goto nla_put_failure;
@@ -2423,6 +2386,14 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+			     u32 portid, u32 seq, struct mr_mfc *c, int cmd,
+			     int flags)
+{
+	return ipmr_fill_mroute(mrt, skb, portid, seq, (struct mfc_cache *)c,
+				cmd, flags);
+}
+
 static size_t mroute_msgsize(bool unresolved, int maxvif)
 {
 	size_t len =
@@ -2596,62 +2567,8 @@ errout_free:
 
 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct net *net = sock_net(skb->sk);
-	unsigned int t = 0, s_t;
-	unsigned int e = 0, s_e;
-	struct mr_table *mrt;
-	struct mr_mfc *mfc;
-
-	s_t = cb->args[0];
-	s_e = cb->args[1];
-
-	rcu_read_lock();
-	ipmr_for_each_table(mrt, net) {
-		if (t < s_t)
-			goto next_table;
-		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
-			if (e < s_e)
-				goto next_entry;
-			if (ipmr_fill_mroute(mrt, skb,
-					     NETLINK_CB(cb->skb).portid,
-					     cb->nlh->nlmsg_seq,
-					     (struct mfc_cache *)mfc,
-					     RTM_NEWROUTE, NLM_F_MULTI) < 0)
-				goto done;
-next_entry:
-			e++;
-		}
-		e = 0;
-		s_e = 0;
-
-		spin_lock_bh(&mfc_unres_lock);
-		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
-			if (e < s_e)
-				goto next_entry2;
-			if (ipmr_fill_mroute(mrt, skb,
-					     NETLINK_CB(cb->skb).portid,
-					     cb->nlh->nlmsg_seq,
-					     (struct mfc_cache *)mfc,
-					     RTM_NEWROUTE, NLM_F_MULTI) < 0) {
-				spin_unlock_bh(&mfc_unres_lock);
-				goto done;
-			}
-next_entry2:
-			e++;
-		}
-		spin_unlock_bh(&mfc_unres_lock);
-		e = 0;
-		s_e = 0;
-next_table:
-		t++;
-	}
-done:
-	rcu_read_unlock();
-
-	cb->args[1] = e;
-	cb->args[0] = t;
-
-	return skb->len;
+	return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
+				_ipmr_fill_mroute, &mfc_unres_lock);
 }
 
 static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index e1b7b639e9b1..8ba55bfda817 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -198,3 +198,126 @@ end_of_list:
 }
 EXPORT_SYMBOL(mr_mfc_seq_next);
 #endif
+
+int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+		   struct mr_mfc *c, struct rtmsg *rtm)
+{
+	struct rta_mfc_stats mfcs;
+	struct nlattr *mp_attr;
+	struct rtnexthop *nhp;
+	unsigned long lastuse;
+	int ct;
+
+	/* If cache is unresolved, don't try to parse IIF and OIF */
+	if (c->mfc_parent >= MAXVIFS) {
+		rtm->rtm_flags |= RTNH_F_UNRESOLVED;
+		return -ENOENT;
+	}
+
+	if (VIF_EXISTS(mrt, c->mfc_parent) &&
+	    nla_put_u32(skb, RTA_IIF,
+			mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
+		return -EMSGSIZE;
+
+	if (c->mfc_flags & MFC_OFFLOAD)
+		rtm->rtm_flags |= RTNH_F_OFFLOAD;
+
+	mp_attr = nla_nest_start(skb, RTA_MULTIPATH);
+	if (!mp_attr)
+		return -EMSGSIZE;
+
+	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
+			struct vif_device *vif;
+
+			nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
+			if (!nhp) {
+				nla_nest_cancel(skb, mp_attr);
+				return -EMSGSIZE;
+			}
+
+			nhp->rtnh_flags = 0;
+			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
+			vif = &mrt->vif_table[ct];
+			nhp->rtnh_ifindex = vif->dev->ifindex;
+			nhp->rtnh_len = sizeof(*nhp);
+		}
+	}
+
+	nla_nest_end(skb, mp_attr);
+
+	lastuse = READ_ONCE(c->mfc_un.res.lastuse);
+	lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
+
+	mfcs.mfcs_packets = c->mfc_un.res.pkt;
+	mfcs.mfcs_bytes = c->mfc_un.res.bytes;
+	mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
+	if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
+	    nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
+			      RTA_PAD))
+		return -EMSGSIZE;
+
+	rtm->rtm_type = RTN_MULTICAST;
+	return 1;
+}
+EXPORT_SYMBOL(mr_fill_mroute);
+
+int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
+		     struct mr_table *(*iter)(struct net *net,
+					      struct mr_table *mrt),
+		     int (*fill)(struct mr_table *mrt,
+				 struct sk_buff *skb,
+				 u32 portid, u32 seq, struct mr_mfc *c,
+				 int cmd, int flags),
+		     spinlock_t *lock)
+{
+	unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1];
+	struct net *net = sock_net(skb->sk);
+	struct mr_table *mrt;
+	struct mr_mfc *mfc;
+
+	rcu_read_lock();
+	for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
+		if (t < s_t)
+			goto next_table;
+		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+			if (e < s_e)
+				goto next_entry;
+			if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+				 cb->nlh->nlmsg_seq, mfc,
+				 RTM_NEWROUTE, NLM_F_MULTI) < 0)
+				goto done;
+next_entry:
+			e++;
+		}
+		e = 0;
+		s_e = 0;
+
+		spin_lock_bh(lock);
+		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
+			if (e < s_e)
+				goto next_entry2;
+			if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+				 cb->nlh->nlmsg_seq, mfc,
+				 RTM_NEWROUTE, NLM_F_MULTI) < 0) {
+				spin_unlock_bh(lock);
+				goto done;
+			}
+next_entry2:
+			e++;
+		}
+		spin_unlock_bh(lock);
+		e = 0;
+		s_e = 0;
+next_table:
+		t++;
+	}
+done:
+	rcu_read_unlock();
+
+	cb->args[1] = e;
+	cb->args[0] = t;
+
+	return skb->len;
+}
+EXPORT_SYMBOL(mr_rtm_dumproute);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index c3b3f1c381e1..2a38f9de45d3 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -87,8 +87,6 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
 			   struct sk_buff *skb, struct mfc6_cache *cache);
 static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
 			      mifi_t mifi, int assert);
-static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
-			       struct mfc6_cache *c, struct rtmsg *rtm);
 static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
 			      int cmd);
 static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
@@ -101,6 +99,23 @@ static void ipmr_expire_process(struct timer_list *t);
 #define ip6mr_for_each_table(mrt, net) \
 	list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list)
 
+static struct mr_table *ip6mr_mr_table_iter(struct net *net,
+					    struct mr_table *mrt)
+{
+	struct mr_table *ret;
+
+	if (!mrt)
+		ret = list_entry_rcu(net->ipv6.mr6_tables.next,
+				     struct mr_table, list);
+	else
+		ret = list_entry_rcu(mrt->list.next,
+				     struct mr_table, list);
+
+	if (&ret->list == &net->ipv6.mr6_tables)
+		return NULL;
+	return ret;
+}
+
 static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
 {
 	struct mr_table *mrt;
@@ -247,6 +262,14 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
 #define ip6mr_for_each_table(mrt, net) \
 	for (mrt = net->ipv6.mrt6; mrt; mrt = NULL)
 
+static struct mr_table *ip6mr_mr_table_iter(struct net *net,
+					    struct mr_table *mrt)
+{
+	if (!mrt)
+		return net->ipv6.mrt6;
+	return NULL;
+}
+
 static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
 {
 	return net->ipv6.mrt6;
@@ -948,7 +971,8 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt,
 			struct nlmsghdr *nlh = skb_pull(skb,
 							sizeof(struct ipv6hdr));
 
-			if (__ip6mr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) {
+			if (mr_fill_mroute(mrt, skb, &c->_c,
+					   nlmsg_data(nlh)) > 0) {
 				nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh;
 			} else {
 				nlh->nlmsg_type = NLMSG_ERROR;
@@ -2081,63 +2105,6 @@ int ip6_mr_input(struct sk_buff *skb)
 	return 0;
 }
 
-
-static int __ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
-			       struct mfc6_cache *c, struct rtmsg *rtm)
-{
-	struct rta_mfc_stats mfcs;
-	struct nlattr *mp_attr;
-	struct rtnexthop *nhp;
-	unsigned long lastuse;
-	int ct;
-
-	/* If cache is unresolved, don't try to parse IIF and OIF */
-	if (c->_c.mfc_parent >= MAXMIFS) {
-		rtm->rtm_flags |= RTNH_F_UNRESOLVED;
-		return -ENOENT;
-	}
-
-	if (VIF_EXISTS(mrt, c->_c.mfc_parent) &&
-	    nla_put_u32(skb, RTA_IIF,
-			mrt->vif_table[c->_c.mfc_parent].dev->ifindex) < 0)
-		return -EMSGSIZE;
-	mp_attr = nla_nest_start(skb, RTA_MULTIPATH);
-	if (!mp_attr)
-		return -EMSGSIZE;
-
-	for (ct = c->_c.mfc_un.res.minvif;
-	     ct < c->_c.mfc_un.res.maxvif; ct++) {
-		if (VIF_EXISTS(mrt, ct) && c->_c.mfc_un.res.ttls[ct] < 255) {
-			nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
-			if (!nhp) {
-				nla_nest_cancel(skb, mp_attr);
-				return -EMSGSIZE;
-			}
-
-			nhp->rtnh_flags = 0;
-			nhp->rtnh_hops = c->_c.mfc_un.res.ttls[ct];
-			nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
-			nhp->rtnh_len = sizeof(*nhp);
-		}
-	}
-
-	nla_nest_end(skb, mp_attr);
-
-	lastuse = READ_ONCE(c->_c.mfc_un.res.lastuse);
-	lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
-
-	mfcs.mfcs_packets = c->_c.mfc_un.res.pkt;
-	mfcs.mfcs_bytes = c->_c.mfc_un.res.bytes;
-	mfcs.mfcs_wrong_if = c->_c.mfc_un.res.wrong_if;
-	if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
-	    nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
-			      RTA_PAD))
-		return -EMSGSIZE;
-
-	rtm->rtm_type = RTN_MULTICAST;
-	return 1;
-}
-
 int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 		    u32 portid)
 {
@@ -2203,7 +2170,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 		return err;
 	}
 
-	err = __ip6mr_fill_mroute(mrt, skb, cache, rtm);
+	err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
 	read_unlock(&mrt_lock);
 	return err;
 }
@@ -2239,7 +2206,7 @@ static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
 	if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin) ||
 	    nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp))
 		goto nla_put_failure;
-	err = __ip6mr_fill_mroute(mrt, skb, c, rtm);
+	err = mr_fill_mroute(mrt, skb, &c->_c, rtm);
 	/* do not break the dump if cache is unresolved */
 	if (err < 0 && err != -ENOENT)
 		goto nla_put_failure;
@@ -2252,6 +2219,14 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
+			      u32 portid, u32 seq, struct mr_mfc *c,
+			      int cmd, int flags)
+{
+	return ip6mr_fill_mroute(mrt, skb, portid, seq, (struct mfc6_cache *)c,
+				 cmd, flags);
+}
+
 static int mr6_msgsize(bool unresolved, int maxvif)
 {
 	size_t len =
@@ -2365,59 +2340,6 @@ errout:
 
 static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct net *net = sock_net(skb->sk);
-	unsigned int t = 0, s_t;
-	unsigned int e = 0, s_e;
-	struct mr_table *mrt;
-	struct mr_mfc *mfc;
-
-	s_t = cb->args[0];
-	s_e = cb->args[1];
-
-	rcu_read_lock();
-	ip6mr_for_each_table(mrt, net) {
-		if (t < s_t)
-			goto next_table;
-		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
-			if (e < s_e)
-				goto next_entry;
-			if (ip6mr_fill_mroute(mrt, skb,
-					      NETLINK_CB(cb->skb).portid,
-					      cb->nlh->nlmsg_seq,
-					      (struct mfc6_cache *)mfc,
-					      RTM_NEWROUTE, NLM_F_MULTI) < 0)
-				goto done;
-next_entry:
-			e++;
-		}
-		e = 0;
-		s_e = 0;
-
-		spin_lock_bh(&mfc_unres_lock);
-		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
-			if (e < s_e)
-				goto next_entry2;
-			if (ip6mr_fill_mroute(mrt, skb,
-					      NETLINK_CB(cb->skb).portid,
-					      cb->nlh->nlmsg_seq,
-					     (struct mfc6_cache *)mfc,
-					      RTM_NEWROUTE, NLM_F_MULTI) < 0) {
-				spin_unlock_bh(&mfc_unres_lock);
-				goto done;
-			}
-next_entry2:
-			e++;
-		}
-		spin_unlock_bh(&mfc_unres_lock);
-		e = s_e = 0;
-next_table:
-		t++;
-	}
-done:
-	rcu_read_unlock();
-
-	cb->args[1] = e;
-	cb->args[0] = t;
-
-	return skb->len;
+	return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
+				_ip6mr_fill_mroute, &mfc_unres_lock);
 }
-- 
cgit v1.2.3


From 0f6271264afd975bc599d6f30f3693e9aea57036 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <stefan.raspl@de.ibm.com>
Date: Thu, 1 Mar 2018 13:51:26 +0100
Subject: net/smc: cleanup smc_llc.h and smc_clc.h headers

Remove structures used internal only from headers.
And remove an extra function parameter.

Signed-off-by: Stefan Raspl <raspl@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c  |  8 ++++----
 net/smc/smc_clc.c |  3 +++
 net/smc/smc_clc.h |  6 ------
 net/smc/smc_llc.c | 30 ++++++++++++++++++++++++++++++
 net/smc/smc_llc.h | 28 ----------------------------
 5 files changed, 37 insertions(+), 38 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 38ae22b65e77..b1961a789837 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -312,7 +312,7 @@ out:
 	return rc;
 }
 
-static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
+static int smc_clnt_conf_first_link(struct smc_sock *smc)
 {
 	struct smc_link_group *lgr = smc->conn.lgr;
 	struct smc_link *link;
@@ -346,7 +346,8 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
 	/* send CONFIRM LINK response over RoCE fabric */
 	rc = smc_llc_send_confirm_link(link,
 				       link->smcibdev->mac[link->ibport - 1],
-				       gid, SMC_LLC_RESP);
+				       &link->smcibdev->gid[link->ibport - 1],
+				       SMC_LLC_RESP);
 	if (rc < 0)
 		return SMC_CLC_DECL_TCL;
 
@@ -498,8 +499,7 @@ static int smc_connect_rdma(struct smc_sock *smc)
 
 	if (local_contact == SMC_FIRST_CONTACT) {
 		/* QP confirmation over RoCE fabric */
-		reason_code = smc_clnt_conf_first_link(
-			smc, &smcibdev->gid[ibport - 1]);
+		reason_code = smc_clnt_conf_first_link(smc);
 		if (reason_code < 0) {
 			rc = reason_code;
 			goto out_err_unlock;
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 8ac51583a063..dff318a2d5bf 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -22,6 +22,9 @@
 #include "smc_clc.h"
 #include "smc_ib.h"
 
+/* eye catcher "SMCR" EBCDIC for CLC messages */
+static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
+
 /* check if received message has a correct header length and contains valid
  * heading and trailing eyecatchers
  */
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index c145a0f36a68..aab3aae2a2ce 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -22,9 +22,6 @@
 #define SMC_CLC_CONFIRM		0x03
 #define SMC_CLC_DECLINE		0x04
 
-/* eye catcher "SMCR" EBCDIC for CLC messages */
-static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
-
 #define SMC_CLC_V1		0x1		/* SMC version                */
 #define CLC_WAIT_TIME		(6 * HZ)	/* max. wait time on clcsock  */
 #define SMC_CLC_DECL_MEM	0x01010000  /* insufficient memory resources  */
@@ -124,9 +121,6 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
 	       ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
 }
 
-struct smc_sock;
-struct smc_ib_device;
-
 int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 		     u8 expected_type);
 int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 92fe4cc8c82c..e4502bbff33d 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -21,6 +21,36 @@
 #include "smc_clc.h"
 #include "smc_llc.h"
 
+#define SMC_LLC_DATA_LEN		40
+
+struct smc_llc_hdr {
+	struct smc_wr_rx_hdr common;
+	u8 length;	/* 44 */
+	u8 reserved;
+	u8 flags;
+};
+
+struct smc_llc_msg_confirm_link {	/* type 0x01 */
+	struct smc_llc_hdr hd;
+	u8 sender_mac[ETH_ALEN];
+	u8 sender_gid[SMC_GID_SIZE];
+	u8 sender_qp_num[3];
+	u8 link_num;
+	u8 link_uid[SMC_LGR_ID_SIZE];
+	u8 max_links;
+	u8 reserved[9];
+};
+
+union smc_llc_msg {
+	struct smc_llc_msg_confirm_link confirm_link;
+	struct {
+		struct smc_llc_hdr hdr;
+		u8 data[SMC_LLC_DATA_LEN];
+	} raw;
+};
+
+#define SMC_LLC_FLAG_RESP		0x80
+
 /********************************** send *************************************/
 
 struct smc_llc_tx_pend {
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index 51b27ce90dbd..a7888607ab53 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -28,34 +28,6 @@ enum smc_llc_msg_type {
 	SMC_LLC_CONFIRM_LINK		= 0x01,
 };
 
-#define SMC_LLC_DATA_LEN		40
-
-struct smc_llc_hdr {
-	struct smc_wr_rx_hdr common;
-	u8 length;	/* 44 */
-	u8 reserved;
-	u8 flags;
-};
-
-struct smc_llc_msg_confirm_link {	/* type 0x01 */
-	struct smc_llc_hdr hd;
-	u8 sender_mac[ETH_ALEN];
-	u8 sender_gid[SMC_GID_SIZE];
-	u8 sender_qp_num[3];
-	u8 link_num;
-	u8 link_uid[SMC_LGR_ID_SIZE];
-	u8 max_links;
-	u8 reserved[9];
-};
-
-union smc_llc_msg {
-	struct smc_llc_msg_confirm_link confirm_link;
-	struct {
-		struct smc_llc_hdr hdr;
-		u8 data[SMC_LLC_DATA_LEN];
-	} raw;
-};
-
 /* transmit */
 int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid,
 			      enum smc_llc_reqresp reqresp);
-- 
cgit v1.2.3


From 696cd3016975d31e3499c49a7a747d7615a16b3b Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.vnet.ibm.com>
Date: Thu, 1 Mar 2018 13:51:27 +0100
Subject: net/smc: move netinfo function to file smc_clc.c

The function smc_netinfo_by_tcpsk() belongs to CLC handling.
Move it to smc_clc.c and rename to smc_clc_netinfo_by_tcpsk.

Signed-off-by: Karsten Graul <kgraul@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c  | 42 +-----------------------------------------
 net/smc/smc.h     |  2 --
 net/smc/smc_clc.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
 net/smc/smc_clc.h |  2 ++
 4 files changed, 45 insertions(+), 45 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index b1961a789837..b90cbfdb9916 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -24,7 +24,6 @@
 
 #include <linux/module.h>
 #include <linux/socket.h>
-#include <linux/inetdevice.h>
 #include <linux/workqueue.h>
 #include <linux/in.h>
 #include <linux/sched/signal.h>
@@ -273,45 +272,6 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
 	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
 }
 
-/* determine subnet and mask of internal TCP socket */
-int smc_netinfo_by_tcpsk(struct socket *clcsock,
-			 __be32 *subnet, u8 *prefix_len)
-{
-	struct dst_entry *dst = sk_dst_get(clcsock->sk);
-	struct in_device *in_dev;
-	struct sockaddr_in addr;
-	int rc = -ENOENT;
-
-	if (!dst) {
-		rc = -ENOTCONN;
-		goto out;
-	}
-	if (!dst->dev) {
-		rc = -ENODEV;
-		goto out_rel;
-	}
-
-	/* get address to which the internal TCP socket is bound */
-	kernel_getsockname(clcsock, (struct sockaddr *)&addr);
-	/* analyze IPv4 specific data of net_device belonging to TCP socket */
-	rcu_read_lock();
-	in_dev = __in_dev_get_rcu(dst->dev);
-	for_ifa(in_dev) {
-		if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
-			continue;
-		*prefix_len = inet_mask_len(ifa->ifa_mask);
-		*subnet = ifa->ifa_address & ifa->ifa_mask;
-		rc = 0;
-		break;
-	} endfor_ifa(in_dev);
-	rcu_read_unlock();
-
-out_rel:
-	dst_release(dst);
-out:
-	return rc;
-}
-
 static int smc_clnt_conf_first_link(struct smc_sock *smc)
 {
 	struct smc_link_group *lgr = smc->conn.lgr;
@@ -808,7 +768,7 @@ static void smc_listen_work(struct work_struct *work)
 	}
 
 	/* determine subnet and mask from internal TCP socket */
-	rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
+	rc = smc_clc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
 	if (rc) {
 		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 		goto decline_rdma;
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 9518986c97b1..9895c190d146 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -263,8 +263,6 @@ static inline bool using_ipsec(struct smc_sock *smc)
 
 struct smc_clc_msg_local;
 
-int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet,
-			 u8 *prefix_len);
 void smc_conn_free(struct smc_connection *conn);
 int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
 		    struct smc_ib_device *smcibdev, u8 ibport,
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index dff318a2d5bf..874c5a75d6dd 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/in.h>
+#include <linux/inetdevice.h>
 #include <linux/if_ether.h>
 #include <linux/sched/signal.h>
 
@@ -73,6 +74,45 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
 	return true;
 }
 
+/* determine subnet and mask of internal TCP socket */
+int smc_clc_netinfo_by_tcpsk(struct socket *clcsock,
+			     __be32 *subnet, u8 *prefix_len)
+{
+	struct dst_entry *dst = sk_dst_get(clcsock->sk);
+	struct in_device *in_dev;
+	struct sockaddr_in addr;
+	int rc = -ENOENT;
+
+	if (!dst) {
+		rc = -ENOTCONN;
+		goto out;
+	}
+	if (!dst->dev) {
+		rc = -ENODEV;
+		goto out_rel;
+	}
+
+	/* get address to which the internal TCP socket is bound */
+	kernel_getsockname(clcsock, (struct sockaddr *)&addr);
+	/* analyze IPv4 specific data of net_device belonging to TCP socket */
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dst->dev);
+	for_ifa(in_dev) {
+		if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
+			continue;
+		*prefix_len = inet_mask_len(ifa->ifa_mask);
+		*subnet = ifa->ifa_address & ifa->ifa_mask;
+		rc = 0;
+		break;
+	} endfor_ifa(in_dev);
+	rcu_read_unlock();
+
+out_rel:
+	dst_release(dst);
+out:
+	return rc;
+}
+
 /* Wait for data on the tcp-socket, analyze received data
  * Returns:
  * 0 if success and it was not a decline that we received.
@@ -214,8 +254,8 @@ int smc_clc_send_proposal(struct smc_sock *smc,
 
 	memset(&pclc_prfx, 0, sizeof(pclc_prfx));
 	/* determine subnet and mask from internal TCP socket */
-	rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet,
-				  &pclc_prfx.prefix_len);
+	rc = smc_clc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet,
+				      &pclc_prfx.prefix_len);
 	if (rc)
 		return SMC_CLC_DECL_CNFERR; /* configuration error */
 	pclc_prfx.ipv6_prefixes_cnt = 0;
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index aab3aae2a2ce..a20fc75efb24 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -121,6 +121,8 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
 	       ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
 }
 
+int smc_clc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet,
+			     u8 *prefix_len);
 int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 		     u8 expected_type);
 int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);
-- 
cgit v1.2.3


From be6d467b997f9e32aa9b27add06e7b0c8627a566 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.vnet.ibm.com>
Date: Thu, 1 Mar 2018 13:51:28 +0100
Subject: net/smc: remove unused fields from smc structures

The daddr field holds the destination IPv4 address. The field was set but
never used and can be removed. The addr field was a left-over from an
earlier version of non-blocking connects and can be removed.
The result of the call to kernel_getpeername is not used, the call can be
removed. Non-blocking connects are working, so remove restriction comment.

Signed-off-by: Karsten Graul <kgraul@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   | 15 ++++-----------
 net/smc/smc.h      |  3 +--
 net/smc/smc_core.c |  7 +++----
 net/smc/smc_core.h |  1 -
 4 files changed, 8 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index b90cbfdb9916..cda3d5314e3f 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -7,7 +7,6 @@
  *  applicable with RoCE-cards only
  *
  *  Initial restrictions:
- *    - non-blocking connect postponed
  *    - IPv6 support postponed
  *    - support for alternate links postponed
  *    - partial support for non-blocking sockets only
@@ -345,7 +344,6 @@ static void smc_lgr_forget(struct smc_link_group *lgr)
 /* setup for RDMA connection of client */
 static int smc_connect_rdma(struct smc_sock *smc)
 {
-	struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
 	struct smc_clc_msg_accept_confirm aclc;
 	int local_contact = SMC_FIRST_CONTACT;
 	struct smc_ib_device *smcibdev;
@@ -399,8 +397,8 @@ static int smc_connect_rdma(struct smc_sock *smc)
 
 	srv_first_contact = aclc.hdr.flag;
 	mutex_lock(&smc_create_lgr_pending);
-	local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
-					ibport, &aclc.lcl, srv_first_contact);
+	local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl,
+					srv_first_contact);
 	if (local_contact < 0) {
 		rc = local_contact;
 		if (rc == -ENOMEM)
@@ -518,7 +516,6 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 		goto out_err;
 	if (addr->sa_family != AF_INET)
 		goto out_err;
-	smc->addr = addr;	/* needed for nonblocking connect */
 
 	lock_sock(sk);
 	switch (sk->sk_state) {
@@ -726,7 +723,6 @@ static void smc_listen_work(struct work_struct *work)
 	struct sock *newsmcsk = &new_smc->sk;
 	struct smc_clc_msg_proposal *pclc;
 	struct smc_ib_device *smcibdev;
-	struct sockaddr_in peeraddr;
 	u8 buf[SMC_CLC_MAX_LEN];
 	struct smc_link *link;
 	int reason_code = 0;
@@ -782,13 +778,10 @@ static void smc_listen_work(struct work_struct *work)
 		goto decline_rdma;
 	}
 
-	/* get address of the peer connected to the internal TCP socket */
-	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr);
-
 	/* allocate connection / link group */
 	mutex_lock(&smc_create_lgr_pending);
-	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
-					smcibdev, ibport, &pclc->lcl, 0);
+	local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl,
+					0);
 	if (local_contact < 0) {
 		rc = local_contact;
 		if (rc == -ENOMEM)
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 9895c190d146..268cdf11533c 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -172,7 +172,6 @@ struct smc_sock {				/* smc sock container */
 	struct sock		sk;
 	struct socket		*clcsock;	/* internal tcp socket */
 	struct smc_connection	conn;		/* smc connection */
-	struct sockaddr		*addr;		/* inet connect address */
 	struct smc_sock		*listen_smc;	/* listen parent */
 	struct work_struct	tcp_listen_work;/* handle tcp socket accepts */
 	struct work_struct	smc_listen_work;/* prepare new accept socket */
@@ -264,7 +263,7 @@ static inline bool using_ipsec(struct smc_sock *smc)
 struct smc_clc_msg_local;
 
 void smc_conn_free(struct smc_connection *conn);
-int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
+int smc_conn_create(struct smc_sock *smc,
 		    struct smc_ib_device *smcibdev, u8 ibport,
 		    struct smc_clc_msg_local *lcl, int srv_first_contact);
 struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 2424c7100aaf..bc11d06e38ae 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -144,7 +144,7 @@ free:
 }
 
 /* create a new SMC link group */
-static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
+static int smc_lgr_create(struct smc_sock *smc,
 			  struct smc_ib_device *smcibdev, u8 ibport,
 			  char *peer_systemid, unsigned short vlan_id)
 {
@@ -161,7 +161,6 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
 	}
 	lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
 	lgr->sync_err = false;
-	lgr->daddr = peer_in_addr;
 	memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
 	lgr->vlan_id = vlan_id;
 	rwlock_init(&lgr->sndbufs_lock);
@@ -400,7 +399,7 @@ static int smc_link_determine_gid(struct smc_link_group *lgr)
 }
 
 /* create a new SMC connection (and a new link group if necessary) */
-int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
+int smc_conn_create(struct smc_sock *smc,
 		    struct smc_ib_device *smcibdev, u8 ibport,
 		    struct smc_clc_msg_local *lcl, int srv_first_contact)
 {
@@ -457,7 +456,7 @@ int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
 
 create:
 	if (local_contact == SMC_FIRST_CONTACT) {
-		rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport,
+		rc = smc_lgr_create(smc, smcibdev, ibport,
 				    lcl->id_for_peer, vlan_id);
 		if (rc)
 			goto out;
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index fe691bf9af91..7852c3fabf12 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -124,7 +124,6 @@ struct smc_rtoken {				/* address/key of remote RMB */
 struct smc_link_group {
 	struct list_head	list;
 	enum smc_lgr_role	role;		/* client or server */
-	__be32			daddr;		/* destination ip address */
 	struct smc_link		lnk[SMC_LINKS_PER_LGR_MAX];	/* smc link */
 	char			peer_systemid[SMC_SYSTEMID_LEN];
 						/* unique system_id of peer */
-- 
cgit v1.2.3


From 313164da55da0fb24191e729f989f4b2c2793ead Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.vnet.ibm.com>
Date: Thu, 1 Mar 2018 13:51:29 +0100
Subject: net/smc: respond to test link messages

Add TEST LINK message responses, which also serves as preparation for
support of sockopt TCP_KEEPALIVE.

Signed-off-by: Karsten Graul <kgraul@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_llc.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/smc/smc_llc.h |  3 +++
 2 files changed, 56 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index e4502bbff33d..9e0a556e40c8 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -41,8 +41,15 @@ struct smc_llc_msg_confirm_link {	/* type 0x01 */
 	u8 reserved[9];
 };
 
+struct smc_llc_msg_test_link {		/* type 0x07 */
+	struct smc_llc_hdr hd;
+	u8 user_data[16];
+	u8 reserved[24];
+};
+
 union smc_llc_msg {
 	struct smc_llc_msg_confirm_link confirm_link;
+	struct smc_llc_msg_test_link test_link;
 	struct {
 		struct smc_llc_hdr hdr;
 		u8 data[SMC_LLC_DATA_LEN];
@@ -130,6 +137,30 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
 	return rc;
 }
 
+/* send LLC test link request or response */
+int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16],
+			   enum smc_llc_reqresp reqresp)
+{
+	struct smc_llc_msg_test_link *testllc;
+	struct smc_wr_tx_pend_priv *pend;
+	struct smc_wr_buf *wr_buf;
+	int rc;
+
+	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+	if (rc)
+		return rc;
+	testllc = (struct smc_llc_msg_test_link *)wr_buf;
+	memset(testllc, 0, sizeof(*testllc));
+	testllc->hd.common.type = SMC_LLC_TEST_LINK;
+	testllc->hd.length = sizeof(struct smc_llc_msg_test_link);
+	if (reqresp == SMC_LLC_RESP)
+		testllc->hd.flags |= SMC_LLC_FLAG_RESP;
+	memcpy(testllc->user_data, user_data, sizeof(testllc->user_data));
+	/* send llc message */
+	rc = smc_wr_tx_send(link, pend);
+	return rc;
+}
+
 /********************************* receive ***********************************/
 
 static void smc_llc_rx_confirm_link(struct smc_link *link,
@@ -149,6 +180,16 @@ static void smc_llc_rx_confirm_link(struct smc_link *link,
 	}
 }
 
+static void smc_llc_rx_test_link(struct smc_link *link,
+				 struct smc_llc_msg_test_link *llc)
+{
+	if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
+		/* unused as long as we don't send this type of msg */
+	} else {
+		smc_llc_send_test_link(link, llc->user_data, SMC_LLC_RESP);
+	}
+}
+
 static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
 {
 	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
@@ -158,8 +199,15 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
 		return; /* short message */
 	if (llc->raw.hdr.length != sizeof(*llc))
 		return; /* invalid message */
-	if (llc->raw.hdr.common.type == SMC_LLC_CONFIRM_LINK)
+
+	switch (llc->raw.hdr.common.type) {
+	case SMC_LLC_TEST_LINK:
+		smc_llc_rx_test_link(link, &llc->test_link);
+		break;
+	case SMC_LLC_CONFIRM_LINK:
 		smc_llc_rx_confirm_link(link, &llc->confirm_link);
+		break;
+	}
 }
 
 /***************************** init, exit, misc ******************************/
@@ -169,6 +217,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
 		.handler	= smc_llc_rx_handler,
 		.type		= SMC_LLC_CONFIRM_LINK
 	},
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_TEST_LINK
+	},
 	{
 		.handler	= NULL,
 	}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index a7888607ab53..6c8a062db4f3 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -26,11 +26,14 @@ enum smc_llc_reqresp {
 
 enum smc_llc_msg_type {
 	SMC_LLC_CONFIRM_LINK		= 0x01,
+	SMC_LLC_TEST_LINK		= 0x07,
 };
 
 /* transmit */
 int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid,
 			      enum smc_llc_reqresp reqresp);
+int smc_llc_send_test_link(struct smc_link *lnk, u8 user_data[16],
+			   enum smc_llc_reqresp reqresp);
 int smc_llc_init(void) __init;
 
 #endif /* SMC_LLC_H */
-- 
cgit v1.2.3


From 4ed75de58e9191c011e318dc98b4b157dc633444 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.vnet.ibm.com>
Date: Thu, 1 Mar 2018 13:51:30 +0100
Subject: net/smc: process confirm/delete rkey messages

Process and respond to CONFIRM RKEY and DELETE RKEY messages.

Signed-off-by: Karsten Graul <kgraul@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c |  48 ++++++++++++++----
 net/smc/smc_core.h |   2 +
 net/smc/smc_llc.c  | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_llc.h  |   3 ++
 4 files changed, 186 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index bc11d06e38ae..31bb2d1dbd77 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -697,27 +697,55 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
 	return -ENOSPC;
 }
 
-/* save rkey and dma_addr received from peer during clc handshake */
-int smc_rmb_rtoken_handling(struct smc_connection *conn,
-			    struct smc_clc_msg_accept_confirm *clc)
+/* add a new rtoken from peer */
+int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey)
 {
-	u64 dma_addr = be64_to_cpu(clc->rmb_dma_addr);
-	struct smc_link_group *lgr = conn->lgr;
-	u32 rkey = ntohl(clc->rmb_rkey);
+	u64 dma_addr = be64_to_cpu(nw_vaddr);
+	u32 rkey = ntohl(nw_rkey);
 	int i;
 
 	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
 		if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
 		    (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) &&
 		    test_bit(i, lgr->rtokens_used_mask)) {
-			conn->rtoken_idx = i;
+			/* already in list */
+			return i;
+		}
+	}
+	i = smc_rmb_reserve_rtoken_idx(lgr);
+	if (i < 0)
+		return i;
+	lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey;
+	lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr;
+	return i;
+}
+
+/* delete an rtoken */
+int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey)
+{
+	u32 rkey = ntohl(nw_rkey);
+	int i;
+
+	for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
+		if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey &&
+		    test_bit(i, lgr->rtokens_used_mask)) {
+			lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0;
+			lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0;
+
+			clear_bit(i, lgr->rtokens_used_mask);
 			return 0;
 		}
 	}
-	conn->rtoken_idx = smc_rmb_reserve_rtoken_idx(lgr);
+	return -ENOENT;
+}
+
+/* save rkey and dma_addr received from peer during clc handshake */
+int smc_rmb_rtoken_handling(struct smc_connection *conn,
+			    struct smc_clc_msg_accept_confirm *clc)
+{
+	conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr,
+					  clc->rmb_rkey);
 	if (conn->rtoken_idx < 0)
 		return conn->rtoken_idx;
-	lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey = rkey;
-	lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr = dma_addr;
 	return 0;
 }
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 7852c3fabf12..7be693b940a2 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -189,6 +189,8 @@ void smc_lgr_terminate(struct smc_link_group *lgr);
 int smc_buf_create(struct smc_sock *smc);
 int smc_rmb_rtoken_handling(struct smc_connection *conn,
 			    struct smc_clc_msg_accept_confirm *clc);
+int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey);
+int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey);
 void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
 void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
 void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 9e0a556e40c8..3e47b94608b5 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -47,8 +47,50 @@ struct smc_llc_msg_test_link {		/* type 0x07 */
 	u8 reserved[24];
 };
 
+struct smc_rmb_rtoken {
+	union {
+		u8 num_rkeys;	/* first rtoken byte of CONFIRM LINK msg */
+				/* is actually the num of rtokens, first */
+				/* rtoken is always for the current link */
+		u8 link_id;	/* link id of the rtoken */
+	};
+	__be32 rmb_key;
+	__be64 rmb_vaddr;
+} __packed;			/* format defined in RFC7609 */
+
+#define SMC_LLC_RKEYS_PER_MSG	3
+
+struct smc_llc_msg_confirm_rkey {	/* type 0x06 */
+	struct smc_llc_hdr hd;
+	struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG];
+	u8 reserved;
+};
+
+struct smc_llc_msg_confirm_rkey_cont {	/* type 0x08 */
+	struct smc_llc_hdr hd;
+	u8 num_rkeys;
+	struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG];
+};
+
+#define SMC_LLC_DEL_RKEY_MAX	8
+#define SMC_LLC_FLAG_RKEY_NEG	0x20
+
+struct smc_llc_msg_delete_rkey {	/* type 0x09 */
+	struct smc_llc_hdr hd;
+	u8 num_rkeys;
+	u8 err_mask;
+	u8 reserved[2];
+	__be32 rkey[8];
+	u8 reserved2[4];
+};
+
 union smc_llc_msg {
 	struct smc_llc_msg_confirm_link confirm_link;
+
+	struct smc_llc_msg_confirm_rkey confirm_rkey;
+	struct smc_llc_msg_confirm_rkey_cont confirm_rkey_cont;
+	struct smc_llc_msg_delete_rkey delete_rkey;
+
 	struct smc_llc_msg_test_link test_link;
 	struct {
 		struct smc_llc_hdr hdr;
@@ -161,6 +203,22 @@ int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16],
 	return rc;
 }
 
+/* send a prepared message */
+static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen)
+{
+	struct smc_wr_tx_pend_priv *pend;
+	struct smc_wr_buf *wr_buf;
+	int rc;
+
+	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+	if (rc)
+		return rc;
+	memcpy(wr_buf, llcbuf, llclen);
+	/* send llc message */
+	rc = smc_wr_tx_send(link, pend);
+	return rc;
+}
+
 /********************************* receive ***********************************/
 
 static void smc_llc_rx_confirm_link(struct smc_link *link,
@@ -190,6 +248,70 @@ static void smc_llc_rx_test_link(struct smc_link *link,
 	}
 }
 
+static void smc_llc_rx_confirm_rkey(struct smc_link *link,
+				    struct smc_llc_msg_confirm_rkey *llc)
+{
+	struct smc_link_group *lgr;
+	int rc;
+
+	lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+
+	if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
+		/* unused as long as we don't send this type of msg */
+	} else {
+		rc = smc_rtoken_add(lgr,
+				    llc->rtoken[0].rmb_vaddr,
+				    llc->rtoken[0].rmb_key);
+
+		/* ignore rtokens for other links, we have only one link */
+
+		llc->hd.flags |= SMC_LLC_FLAG_RESP;
+		if (rc < 0)
+			llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
+		smc_llc_send_message(link, (void *)llc, sizeof(*llc));
+	}
+}
+
+static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link,
+				      struct smc_llc_msg_confirm_rkey_cont *llc)
+{
+	if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
+		/* unused as long as we don't send this type of msg */
+	} else {
+		/* ignore rtokens for other links, we have only one link */
+		llc->hd.flags |= SMC_LLC_FLAG_RESP;
+		smc_llc_send_message(link, (void *)llc, sizeof(*llc));
+	}
+}
+
+static void smc_llc_rx_delete_rkey(struct smc_link *link,
+				   struct smc_llc_msg_delete_rkey *llc)
+{
+	struct smc_link_group *lgr;
+	u8 err_mask = 0;
+	int i, max;
+
+	lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+
+	if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
+		/* unused as long as we don't send this type of msg */
+	} else {
+		max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX);
+		for (i = 0; i < max; i++) {
+			if (smc_rtoken_delete(lgr, llc->rkey[i]))
+				err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i);
+		}
+
+		if (err_mask) {
+			llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
+			llc->err_mask = err_mask;
+		}
+
+		llc->hd.flags |= SMC_LLC_FLAG_RESP;
+		smc_llc_send_message(link, (void *)llc, sizeof(*llc));
+	}
+}
+
 static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
 {
 	struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
@@ -207,6 +329,15 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
 	case SMC_LLC_CONFIRM_LINK:
 		smc_llc_rx_confirm_link(link, &llc->confirm_link);
 		break;
+	case SMC_LLC_CONFIRM_RKEY:
+		smc_llc_rx_confirm_rkey(link, &llc->confirm_rkey);
+		break;
+	case SMC_LLC_CONFIRM_RKEY_CONT:
+		smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont);
+		break;
+	case SMC_LLC_DELETE_RKEY:
+		smc_llc_rx_delete_rkey(link, &llc->delete_rkey);
+		break;
 	}
 }
 
@@ -221,6 +352,18 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
 		.handler	= smc_llc_rx_handler,
 		.type		= SMC_LLC_TEST_LINK
 	},
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_CONFIRM_RKEY
+	},
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_CONFIRM_RKEY_CONT
+	},
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_DELETE_RKEY
+	},
 	{
 		.handler	= NULL,
 	}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index 6c8a062db4f3..5573f0d0578e 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -26,7 +26,10 @@ enum smc_llc_reqresp {
 
 enum smc_llc_msg_type {
 	SMC_LLC_CONFIRM_LINK		= 0x01,
+	SMC_LLC_CONFIRM_RKEY		= 0x06,
 	SMC_LLC_TEST_LINK		= 0x07,
+	SMC_LLC_CONFIRM_RKEY_CONT	= 0x08,
+	SMC_LLC_DELETE_RKEY		= 0x09,
 };
 
 /* transmit */
-- 
cgit v1.2.3


From 75d320d611d8569ebe4e42718de035fcc79f8069 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.vnet.ibm.com>
Date: Thu, 1 Mar 2018 13:51:31 +0100
Subject: net/smc: do not allow eyecatchers in rmbe

SMC does not support eyecatchers in RMB elements,
decline peers requesting this support.

Signed-off-by: Karsten Graul <kgraul@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   | 11 +++++++++--
 net/smc/smc_clc.h  |  1 +
 net/smc/smc_core.h |  2 ++
 net/smc/smc_llc.c  | 16 +++++++++++++++-
 4 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index cda3d5314e3f..0d491f505608 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -291,6 +291,9 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
 		return rc;
 	}
 
+	if (link->llc_confirm_rc)
+		return SMC_CLC_DECL_RMBE_EC;
+
 	rc = smc_ib_modify_qp_rts(link);
 	if (rc)
 		return SMC_CLC_DECL_INTERR;
@@ -310,7 +313,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
 	if (rc < 0)
 		return SMC_CLC_DECL_TCL;
 
-	return rc;
+	return 0;
 }
 
 static void smc_conn_save_peer_info(struct smc_sock *smc,
@@ -705,9 +708,13 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
 
 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
 				      SMC_CLC_DECLINE);
+		return rc;
 	}
 
-	return rc;
+	if (link->llc_confirm_resp_rc)
+		return SMC_CLC_DECL_RMBE_EC;
+
+	return 0;
 }
 
 /* setup for RDMA connection of server */
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index a20fc75efb24..20e048beac30 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -33,6 +33,7 @@
 #define SMC_CLC_DECL_INTERR	0x99990000  /* internal error                 */
 #define SMC_CLC_DECL_TCL	0x02040000  /* timeout w4 QP confirm          */
 #define SMC_CLC_DECL_SEND	0x07000000  /* sending problem                */
+#define SMC_CLC_DECL_RMBE_EC	0x08000000  /* peer has eyecatcher in RMBE    */
 
 struct smc_clc_msg_hdr {	/* header1 of clc messages */
 	u8 eyecatcher[4];	/* eye catcher */
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 7be693b940a2..2b65b3d7f1f5 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -89,6 +89,8 @@ struct smc_link {
 	u8			link_id;	/* unique # within link group */
 	struct completion	llc_confirm;	/* wait for rx of conf link */
 	struct completion	llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */
+	int			llc_confirm_rc; /* rc from confirm link msg */
+	int			llc_confirm_resp_rc; /* rc from conf_resp msg */
 };
 
 /* For now we just allow one parallel link per link group. The SMC protocol
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 3e47b94608b5..838a160a3bd9 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -30,6 +30,8 @@ struct smc_llc_hdr {
 	u8 flags;
 };
 
+#define SMC_LLC_FLAG_NO_RMBE_EYEC	0x03
+
 struct smc_llc_msg_confirm_link {	/* type 0x01 */
 	struct smc_llc_hdr hd;
 	u8 sender_mac[ETH_ALEN];
@@ -166,6 +168,7 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
 	memset(confllc, 0, sizeof(*confllc));
 	confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
 	confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link);
+	confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC;
 	if (reqresp == SMC_LLC_RESP)
 		confllc->hd.flags |= SMC_LLC_FLAG_RESP;
 	memcpy(confllc->sender_mac, mac, ETH_ALEN);
@@ -225,13 +228,24 @@ static void smc_llc_rx_confirm_link(struct smc_link *link,
 				    struct smc_llc_msg_confirm_link *llc)
 {
 	struct smc_link_group *lgr;
+	int conf_rc;
 
 	lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+
+	/* RMBE eyecatchers are not supported */
+	if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)
+		conf_rc = 0;
+	else
+		conf_rc = ENOTSUPP;
+
 	if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
-		if (lgr->role == SMC_SERV)
+		if (lgr->role == SMC_SERV) {
+			link->llc_confirm_resp_rc = conf_rc;
 			complete(&link->llc_confirm_resp);
+		}
 	} else {
 		if (lgr->role == SMC_CLNT) {
+			link->llc_confirm_rc = conf_rc;
 			link->link_id = llc->link_num;
 			complete(&link->llc_confirm);
 		}
-- 
cgit v1.2.3


From 52bedf37bafe1d3bc36d1513ad059d9fd28b3c3f Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.vnet.ibm.com>
Date: Thu, 1 Mar 2018 13:51:32 +0100
Subject: net/smc: process add/delete link messages

Add initial support for the LLC messages ADD LINK and DELETE LINK.
Introduce a link state field. Extend the initial LLC handshake with
ADD LINK processing.

Signed-off-by: Karsten Graul <kgraul@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   |  42 ++++++++++++++
 net/smc/smc_core.c |   3 +
 net/smc/smc_core.h |  10 ++++
 net/smc/smc_llc.c  | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 net/smc/smc_llc.h  |   7 +++
 5 files changed, 223 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 0d491f505608..5267ed19b67d 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -313,6 +313,27 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
 	if (rc < 0)
 		return SMC_CLC_DECL_TCL;
 
+	/* receive ADD LINK request from server over RoCE fabric */
+	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
+							 SMC_LLC_WAIT_TIME);
+	if (rest <= 0) {
+		struct smc_clc_msg_decline dclc;
+
+		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+				      SMC_CLC_DECLINE);
+		return rc;
+	}
+
+	/* send add link reject message, only one link supported for now */
+	rc = smc_llc_send_add_link(link,
+				   link->smcibdev->mac[link->ibport - 1],
+				   &link->smcibdev->gid[link->ibport - 1],
+				   SMC_LLC_RESP);
+	if (rc < 0)
+		return SMC_CLC_DECL_TCL;
+
+	link->state = SMC_LNK_ACTIVE;
+
 	return 0;
 }
 
@@ -714,6 +735,27 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
 	if (link->llc_confirm_resp_rc)
 		return SMC_CLC_DECL_RMBE_EC;
 
+	/* send ADD LINK request to client over the RoCE fabric */
+	rc = smc_llc_send_add_link(link,
+				   link->smcibdev->mac[link->ibport - 1],
+				   &link->smcibdev->gid[link->ibport - 1],
+				   SMC_LLC_REQ);
+	if (rc < 0)
+		return SMC_CLC_DECL_TCL;
+
+	/* receive ADD LINK response from client over the RoCE fabric */
+	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
+							 SMC_LLC_WAIT_TIME);
+	if (rest <= 0) {
+		struct smc_clc_msg_decline dclc;
+
+		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+				      SMC_CLC_DECLINE);
+		return rc;
+	}
+
+	link->state = SMC_LNK_ACTIVE;
+
 	return 0;
 }
 
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 31bb2d1dbd77..eb343e15a723 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -176,6 +176,7 @@ static int smc_lgr_create(struct smc_sock *smc,
 
 	lnk = &lgr->lnk[SMC_SINGLE_LINK];
 	/* initialize link */
+	lnk->state = SMC_LNK_ACTIVATING;
 	lnk->smcibdev = smcibdev;
 	lnk->ibport = ibport;
 	lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
@@ -197,6 +198,8 @@ static int smc_lgr_create(struct smc_sock *smc,
 		goto destroy_qp;
 	init_completion(&lnk->llc_confirm);
 	init_completion(&lnk->llc_confirm_resp);
+	init_completion(&lnk->llc_add);
+	init_completion(&lnk->llc_add_resp);
 
 	smc->conn.lgr = lgr;
 	rwlock_init(&lgr->conns_lock);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 2b65b3d7f1f5..cead2093c4b4 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -32,6 +32,12 @@ enum smc_lgr_role {		/* possible roles of a link group */
 	SMC_SERV	/* server */
 };
 
+enum smc_link_state {			/* possible states of a link */
+	SMC_LNK_INACTIVE,	/* link is inactive */
+	SMC_LNK_ACTIVATING,	/* link is being activated */
+	SMC_LNK_ACTIVE		/* link is active */
+};
+
 #define SMC_WR_BUF_SIZE		48	/* size of work request buffer */
 
 struct smc_wr_buf {
@@ -87,10 +93,14 @@ struct smc_link {
 	u8			peer_mac[ETH_ALEN];	/* = gid[8:10||13:15] */
 	u8			peer_gid[sizeof(union ib_gid)];	/* gid of peer*/
 	u8			link_id;	/* unique # within link group */
+
+	enum smc_link_state	state;		/* state of link */
 	struct completion	llc_confirm;	/* wait for rx of conf link */
 	struct completion	llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */
 	int			llc_confirm_rc; /* rc from confirm link msg */
 	int			llc_confirm_resp_rc; /* rc from conf_resp msg */
+	struct completion	llc_add;	/* wait for rx of add link */
+	struct completion	llc_add_resp;	/* wait for rx of add link rsp*/
 };
 
 /* For now we just allow one parallel link per link group. The SMC protocol
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 838a160a3bd9..45b3e0211c39 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -4,9 +4,6 @@
  *
  *  Link Layer Control (LLC)
  *
- *  For now, we only support the necessary "confirm link" functionality
- *  which happens for the first RoCE link after successful CLC handshake.
- *
  *  Copyright IBM Corp. 2016
  *
  *  Author(s):  Klaus Wacker <Klaus.Wacker@de.ibm.com>
@@ -26,7 +23,13 @@
 struct smc_llc_hdr {
 	struct smc_wr_rx_hdr common;
 	u8 length;	/* 44 */
-	u8 reserved;
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u8 reserved:4,
+	   add_link_rej_rsn:4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 add_link_rej_rsn:4,
+	   reserved:4;
+#endif
 	u8 flags;
 };
 
@@ -43,6 +46,33 @@ struct smc_llc_msg_confirm_link {	/* type 0x01 */
 	u8 reserved[9];
 };
 
+#define SMC_LLC_FLAG_ADD_LNK_REJ	0x40
+#define SMC_LLC_REJ_RSN_NO_ALT_PATH	1
+
+#define SMC_LLC_ADD_LNK_MAX_LINKS	2
+
+struct smc_llc_msg_add_link {		/* type 0x02 */
+	struct smc_llc_hdr hd;
+	u8 sender_mac[ETH_ALEN];
+	u8 reserved2[2];
+	u8 sender_gid[SMC_GID_SIZE];
+	u8 sender_qp_num[3];
+	u8 link_num;
+	u8 flags2;	/* QP mtu */
+	u8 initial_psn[3];
+	u8 reserved[8];
+};
+
+#define SMC_LLC_FLAG_DEL_LINK_ALL	0x40
+#define SMC_LLC_FLAG_DEL_LINK_ORDERLY	0x20
+
+struct smc_llc_msg_del_link {		/* type 0x04 */
+	struct smc_llc_hdr hd;
+	u8 link_num;
+	__be32 reason;
+	u8 reserved[35];
+} __packed;			/* format defined in RFC7609 */
+
 struct smc_llc_msg_test_link {		/* type 0x07 */
 	struct smc_llc_hdr hd;
 	u8 user_data[16];
@@ -88,6 +118,8 @@ struct smc_llc_msg_delete_rkey {	/* type 0x09 */
 
 union smc_llc_msg {
 	struct smc_llc_msg_confirm_link confirm_link;
+	struct smc_llc_msg_add_link add_link;
+	struct smc_llc_msg_del_link delete_link;
 
 	struct smc_llc_msg_confirm_rkey confirm_rkey;
 	struct smc_llc_msg_confirm_rkey_cont confirm_rkey_cont;
@@ -176,7 +208,64 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
 	hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
 	/* confllc->link_num = SMC_SINGLE_LINK; already done by memset above */
 	memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE);
-	confllc->max_links = SMC_LINKS_PER_LGR_MAX;
+	confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* enforce peer resp. */
+	/* send llc message */
+	rc = smc_wr_tx_send(link, pend);
+	return rc;
+}
+
+/* send ADD LINK request or response */
+int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
+			  union ib_gid *gid,
+			  enum smc_llc_reqresp reqresp)
+{
+	struct smc_llc_msg_add_link *addllc;
+	struct smc_wr_tx_pend_priv *pend;
+	struct smc_wr_buf *wr_buf;
+	int rc;
+
+	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+	if (rc)
+		return rc;
+	addllc = (struct smc_llc_msg_add_link *)wr_buf;
+	memset(addllc, 0, sizeof(*addllc));
+	addllc->hd.common.type = SMC_LLC_ADD_LINK;
+	addllc->hd.length = sizeof(struct smc_llc_msg_add_link);
+	if (reqresp == SMC_LLC_RESP) {
+		addllc->hd.flags |= SMC_LLC_FLAG_RESP;
+		/* always reject more links for now */
+		addllc->hd.flags |= SMC_LLC_FLAG_ADD_LNK_REJ;
+		addllc->hd.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH;
+	}
+	memcpy(addllc->sender_mac, mac, ETH_ALEN);
+	memcpy(addllc->sender_gid, gid, SMC_GID_SIZE);
+	/* send llc message */
+	rc = smc_wr_tx_send(link, pend);
+	return rc;
+}
+
+/* send DELETE LINK request or response */
+int smc_llc_send_delete_link(struct smc_link *link,
+			     enum smc_llc_reqresp reqresp)
+{
+	struct smc_llc_msg_del_link *delllc;
+	struct smc_wr_tx_pend_priv *pend;
+	struct smc_wr_buf *wr_buf;
+	int rc;
+
+	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+	if (rc)
+		return rc;
+	delllc = (struct smc_llc_msg_del_link *)wr_buf;
+	memset(delllc, 0, sizeof(*delllc));
+	delllc->hd.common.type = SMC_LLC_DELETE_LINK;
+	delllc->hd.length = sizeof(struct smc_llc_msg_add_link);
+	if (reqresp == SMC_LLC_RESP)
+		delllc->hd.flags |= SMC_LLC_FLAG_RESP;
+	/* DEL_LINK_ALL because only 1 link supported */
+	delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
+	delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
+	delllc->link_num = link->link_id;
 	/* send llc message */
 	rc = smc_wr_tx_send(link, pend);
 	return rc;
@@ -239,12 +328,14 @@ static void smc_llc_rx_confirm_link(struct smc_link *link,
 		conf_rc = ENOTSUPP;
 
 	if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
-		if (lgr->role == SMC_SERV) {
+		if (lgr->role == SMC_SERV &&
+		    link->state == SMC_LNK_ACTIVATING) {
 			link->llc_confirm_resp_rc = conf_rc;
 			complete(&link->llc_confirm_resp);
 		}
 	} else {
-		if (lgr->role == SMC_CLNT) {
+		if (lgr->role == SMC_CLNT &&
+		    link->state == SMC_LNK_ACTIVATING) {
 			link->llc_confirm_rc = conf_rc;
 			link->link_id = llc->link_num;
 			complete(&link->llc_confirm);
@@ -252,6 +343,55 @@ static void smc_llc_rx_confirm_link(struct smc_link *link,
 	}
 }
 
+static void smc_llc_rx_add_link(struct smc_link *link,
+				struct smc_llc_msg_add_link *llc)
+{
+	struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+						  lnk[SMC_SINGLE_LINK]);
+
+	if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
+		if (link->state == SMC_LNK_ACTIVATING)
+			complete(&link->llc_add_resp);
+	} else {
+		if (link->state == SMC_LNK_ACTIVATING) {
+			complete(&link->llc_add);
+			return;
+		}
+
+		if (lgr->role == SMC_SERV) {
+			smc_llc_send_add_link(link,
+					link->smcibdev->mac[link->ibport - 1],
+					&link->smcibdev->gid[link->ibport - 1],
+					SMC_LLC_REQ);
+
+		} else {
+			smc_llc_send_add_link(link,
+					link->smcibdev->mac[link->ibport - 1],
+					&link->smcibdev->gid[link->ibport - 1],
+					SMC_LLC_RESP);
+		}
+	}
+}
+
+static void smc_llc_rx_delete_link(struct smc_link *link,
+				   struct smc_llc_msg_del_link *llc)
+{
+	struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+						  lnk[SMC_SINGLE_LINK]);
+
+	if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
+		if (lgr->role == SMC_SERV)
+			smc_lgr_terminate(lgr);
+	} else {
+		if (lgr->role == SMC_SERV) {
+			smc_llc_send_delete_link(link, SMC_LLC_REQ);
+		} else {
+			smc_llc_send_delete_link(link, SMC_LLC_RESP);
+			smc_lgr_terminate(lgr);
+		}
+	}
+}
+
 static void smc_llc_rx_test_link(struct smc_link *link,
 				 struct smc_llc_msg_test_link *llc)
 {
@@ -343,6 +483,12 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
 	case SMC_LLC_CONFIRM_LINK:
 		smc_llc_rx_confirm_link(link, &llc->confirm_link);
 		break;
+	case SMC_LLC_ADD_LINK:
+		smc_llc_rx_add_link(link, &llc->add_link);
+		break;
+	case SMC_LLC_DELETE_LINK:
+		smc_llc_rx_delete_link(link, &llc->delete_link);
+		break;
 	case SMC_LLC_CONFIRM_RKEY:
 		smc_llc_rx_confirm_rkey(link, &llc->confirm_rkey);
 		break;
@@ -366,6 +512,14 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
 		.handler	= smc_llc_rx_handler,
 		.type		= SMC_LLC_TEST_LINK
 	},
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_ADD_LINK
+	},
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_DELETE_LINK
+	},
 	{
 		.handler	= smc_llc_rx_handler,
 		.type		= SMC_LLC_CONFIRM_RKEY
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index 5573f0d0578e..e4a7d5e234d5 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -18,6 +18,7 @@
 #define SMC_LLC_FLAG_RESP		0x80
 
 #define SMC_LLC_WAIT_FIRST_TIME		(5 * HZ)
+#define SMC_LLC_WAIT_TIME		(2 * HZ)
 
 enum smc_llc_reqresp {
 	SMC_LLC_REQ,
@@ -26,6 +27,8 @@ enum smc_llc_reqresp {
 
 enum smc_llc_msg_type {
 	SMC_LLC_CONFIRM_LINK		= 0x01,
+	SMC_LLC_ADD_LINK		= 0x02,
+	SMC_LLC_DELETE_LINK		= 0x04,
 	SMC_LLC_CONFIRM_RKEY		= 0x06,
 	SMC_LLC_TEST_LINK		= 0x07,
 	SMC_LLC_CONFIRM_RKEY_CONT	= 0x08,
@@ -35,6 +38,10 @@ enum smc_llc_msg_type {
 /* transmit */
 int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid,
 			      enum smc_llc_reqresp reqresp);
+int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid,
+			  enum smc_llc_reqresp reqresp);
+int smc_llc_send_delete_link(struct smc_link *link,
+			     enum smc_llc_reqresp reqresp);
 int smc_llc_send_test_link(struct smc_link *lnk, u8 user_data[16],
 			   enum smc_llc_reqresp reqresp);
 int smc_llc_init(void) __init;
-- 
cgit v1.2.3


From 9651b9346f5bc85a4fef96789c756748483d9ee2 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.vnet.ibm.com>
Date: Thu, 1 Mar 2018 13:51:33 +0100
Subject: net/smc: prevent new connections on link group

When the processing of a DELETE LINK message has started,
new connections should not be added to the link group that
is about to terminate.

Signed-off-by: Karsten Graul <kgraul@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   |  9 ---------
 net/smc/smc_core.c | 19 ++++++++++---------
 net/smc/smc_core.h |  1 +
 net/smc/smc_llc.c  |  1 +
 4 files changed, 12 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5267ed19b67d..26684e086750 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -356,15 +356,6 @@ static void smc_link_save_peer_info(struct smc_link *link,
 	link->peer_mtu = clc->qp_mtu;
 }
 
-static void smc_lgr_forget(struct smc_link_group *lgr)
-{
-	spin_lock_bh(&smc_lgr_list.lock);
-	/* do not use this link group for new connections */
-	if (!list_empty(&lgr->list))
-		list_del_init(&lgr->list);
-	spin_unlock_bh(&smc_lgr_list.lock);
-}
-
 /* setup for RDMA connection of client */
 static int smc_connect_rdma(struct smc_sock *smc)
 {
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index eb343e15a723..702ce5f85e97 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -308,6 +308,15 @@ void smc_lgr_free(struct smc_link_group *lgr)
 	kfree(lgr);
 }
 
+void smc_lgr_forget(struct smc_link_group *lgr)
+{
+	spin_lock_bh(&smc_lgr_list.lock);
+	/* do not use this link group for new connections */
+	if (!list_empty(&lgr->list))
+		list_del_init(&lgr->list);
+	spin_unlock_bh(&smc_lgr_list.lock);
+}
+
 /* terminate linkgroup abnormally */
 void smc_lgr_terminate(struct smc_link_group *lgr)
 {
@@ -315,15 +324,7 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	struct smc_sock *smc;
 	struct rb_node *node;
 
-	spin_lock_bh(&smc_lgr_list.lock);
-	if (list_empty(&lgr->list)) {
-		/* termination already triggered */
-		spin_unlock_bh(&smc_lgr_list.lock);
-		return;
-	}
-	/* do not use this link group for new connections */
-	list_del_init(&lgr->list);
-	spin_unlock_bh(&smc_lgr_list.lock);
+	smc_lgr_forget(lgr);
 
 	write_lock_bh(&lgr->conns_lock);
 	node = rb_first(&lgr->conns_all);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index cead2093c4b4..07e2a393e6d9 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -197,6 +197,7 @@ struct smc_sock;
 struct smc_clc_msg_accept_confirm;
 
 void smc_lgr_free(struct smc_link_group *lgr);
+void smc_lgr_forget(struct smc_link_group *lgr);
 void smc_lgr_terminate(struct smc_link_group *lgr);
 int smc_buf_create(struct smc_sock *smc);
 int smc_rmb_rtoken_handling(struct smc_connection *conn,
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 45b3e0211c39..54e8d6dc9201 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -384,6 +384,7 @@ static void smc_llc_rx_delete_link(struct smc_link *link,
 			smc_lgr_terminate(lgr);
 	} else {
 		if (lgr->role == SMC_SERV) {
+			smc_lgr_forget(lgr);
 			smc_llc_send_delete_link(link, SMC_LLC_REQ);
 		} else {
 			smc_llc_send_delete_link(link, SMC_LLC_RESP);
-- 
cgit v1.2.3


From 3a053b1a30dcb4e39569bcce2f4357509260db75 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Wed, 28 Feb 2018 15:59:15 +0200
Subject: net: Fix spelling mistake "greater then" -> "greater than"

Fix trivial spelling mistake "greater then" -> "greater than".

Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 5bdcc5a161fe..40fb3aed5df2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2378,7 +2378,7 @@ EXPORT_SYMBOL(netdev_set_num_tc);
 
 /*
  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
- * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
+ * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
  */
 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 {
-- 
cgit v1.2.3


From f1c02cfb7b30f5c9d9f4e383260abf89d89c3341 Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Wed, 28 Feb 2018 16:40:08 +0100
Subject: ipv6: allow userspace to add IFA_F_OPTIMISTIC addresses

According to RFC 4429 (section 3.1), adding new IPv6 addresses as
optimistic addresses is acceptable, as long as the implementation
follows some rules:

   * Optimistic DAD SHOULD only be used when the implementation is aware
        that the address is based on a most likely unique interface
        identifier (such as in [RFC2464]), generated randomly [RFC3041],
        or by a well-distributed hash function [RFC3972] or assigned by
        Dynamic Host Configuration Protocol for IPv6 (DHCPv6) [RFC3315].
        Optimistic DAD SHOULD NOT be used for manually entered
        addresses.

Thus, it seems reasonable to allow userspace to set the optimistic flag
when adding new addresses.

We must not let userspace set NODAD + OPTIMISTIC, since if the kernel is
not performing DAD we would never clear the optimistic flag. We must
also ignore userspace's request to add OPTIMISTIC flag to addresses that
have already completed DAD (addresses that don't have the TENTATIVE
flag, or that have the DADFAILED flag).

Then we also need to clear the OPTIMISTIC flag on permanent addresses
when DAD fails. Otherwise, IFA_F_OPTIMISTIC addresses added by userspace
can still be used after DAD has failed, because in
ipv6_chk_addr_and_flags(), IFA_F_OPTIMISTIC overrides IFA_F_TENTATIVE.

Setting IFA_F_OPTIMISTIC from userspace is conditional on
CONFIG_IPV6_OPTIMISTIC_DAD and the optimistic_dad sysctl.

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4facfe0b1888..b5fd116c046a 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1459,6 +1459,21 @@ static bool ipv6_use_optimistic_addr(struct net *net,
 #endif
 }
 
+static bool ipv6_allow_optimistic_dad(struct net *net,
+				      struct inet6_dev *idev)
+{
+#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
+	if (!idev)
+		return false;
+	if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad)
+		return false;
+
+	return true;
+#else
+	return false;
+#endif
+}
+
 static int ipv6_get_saddr_eval(struct net *net,
 			       struct ipv6_saddr_score *score,
 			       struct ipv6_saddr_dst *dst,
@@ -1968,6 +1983,8 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed)
 		spin_lock_bh(&ifp->lock);
 		addrconf_del_dad_work(ifp);
 		ifp->flags |= IFA_F_TENTATIVE;
+		if (dad_failed)
+			ifp->flags &= ~IFA_F_OPTIMISTIC;
 		spin_unlock_bh(&ifp->lock);
 		if (dad_failed)
 			ipv6_ifa_notify(0, ifp);
@@ -4501,6 +4518,9 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
 	    (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64))
 		return -EINVAL;
 
+	if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED)
+		ifa_flags &= ~IFA_F_OPTIMISTIC;
+
 	timeout = addrconf_timeout_fixup(valid_lft, HZ);
 	if (addrconf_finite_timeout(timeout)) {
 		expires = jiffies_to_clock_t(timeout * HZ);
@@ -4574,6 +4594,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct in6_addr *pfx, *peer_pfx;
 	struct inet6_ifaddr *ifa;
 	struct net_device *dev;
+	struct inet6_dev *idev;
 	u32 valid_lft = INFINITY_LIFE_TIME, preferred_lft = INFINITY_LIFE_TIME;
 	u32 ifa_flags;
 	int err;
@@ -4607,7 +4628,19 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	/* We ignore other flags so far. */
 	ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
-		     IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN;
+		     IFA_F_NOPREFIXROUTE | IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
+
+	idev = ipv6_find_idev(dev);
+	if (IS_ERR(idev))
+		return PTR_ERR(idev);
+
+	if (!ipv6_allow_optimistic_dad(net, idev))
+		ifa_flags &= ~IFA_F_OPTIMISTIC;
+
+	if (ifa_flags & IFA_F_NODAD && ifa_flags & IFA_F_OPTIMISTIC) {
+		NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive");
+		return -EINVAL;
+	}
 
 	ifa = ipv6_get_ifaddr(net, pfx, dev, 1);
 	if (!ifa) {
-- 
cgit v1.2.3


From 7797dc41417eb0e03f39fc4356f7635bcdef108e Mon Sep 17 00:00:00 2001
From: Soheil Hassas Yeganeh <soheil@google.com>
Date: Tue, 27 Feb 2018 18:22:40 -0500
Subject: socket: skip checking sk_err for recvmmsg(MSG_ERRQUEUE)

recvmmsg does not call ___sys_recvmsg when sk_err is set.
That is fine for normal reads but, for MSG_ERRQUEUE, recvmmsg
should always call ___sys_recvmsg regardless of sk->sk_err to
be able to clear error queue. Otherwise, users are not able to
drain the error queue using recvmmsg.

Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/socket.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/socket.c b/net/socket.c
index 645d32b4872c..d9a1ac233b35 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2289,10 +2289,12 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
 	if (!sock)
 		return err;
 
-	err = sock_error(sock->sk);
-	if (err) {
-		datagrams = err;
-		goto out_put;
+	if (likely(!(flags & MSG_ERRQUEUE))) {
+		err = sock_error(sock->sk);
+		if (err) {
+			datagrams = err;
+			goto out_put;
+		}
 	}
 
 	entry = mmsg;
-- 
cgit v1.2.3


From dcb8c9b4373a583451b1b8a3e916d33de273633d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 28 Feb 2018 14:40:46 -0800
Subject: tcp_bbr: better deal with suboptimal GSO (II)

This is second part of dealing with suboptimal device gso parameters.
In first patch (350c9f484bde "tcp_bbr: better deal with suboptimal GSO")
we dealt with devices having low gso_max_segs

Some devices lower gso_max_size from 64KB to 16 KB (r8152 is an example)

In order to probe an optimal cwnd, we want BBR being not sensitive
to whatever GSO constraint a device can have.

This patch removes tso_segs_goal() CC callback in favor of
min_tso_segs() for CC wanting to override sysctl_tcp_min_tso_segs

Next patch will remove bbr->tso_segs_goal since it does not have
to be persistent.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_bbr.c    | 23 +++++++++++++----------
 net/ipv4/tcp_output.c | 15 ++++++++-------
 2 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index a471f696e13c..afc0567b8a98 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -261,23 +261,26 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 		sk->sk_pacing_rate = rate;
 }
 
-/* Return count of segments we want in the skbs we send, or 0 for default. */
-static u32 bbr_tso_segs_goal(struct sock *sk)
+/* override sysctl_tcp_min_tso_segs */
+static u32 bbr_min_tso_segs(struct sock *sk)
 {
-	struct bbr *bbr = inet_csk_ca(sk);
-
-	return bbr->tso_segs_goal;
+	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
 }
 
 static void bbr_set_tso_segs_goal(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
-	u32 min_segs;
+	u32 segs, bytes;
+
+	/* Sort of tcp_tso_autosize() but ignoring
+	 * driver provided sk_gso_max_size.
+	 */
+	bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift,
+		      GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
+	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
 
-	min_segs = sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
-	bbr->tso_segs_goal = min(tcp_tso_autosize(sk, tp->mss_cache, min_segs),
-				 0x7FU);
+	bbr->tso_segs_goal = min(segs, 0x7FU);
 }
 
 /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
@@ -936,7 +939,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
 	.undo_cwnd	= bbr_undo_cwnd,
 	.cwnd_event	= bbr_cwnd_event,
 	.ssthresh	= bbr_ssthresh,
-	.tso_segs_goal	= bbr_tso_segs_goal,
+	.min_tso_segs	= bbr_min_tso_segs,
 	.get_info	= bbr_get_info,
 	.set_state	= bbr_set_state,
 };
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 49d043de3476..383cac0ff0ec 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1703,8 +1703,8 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
 /* Return how many segs we'd like on a TSO packet,
  * to send one TSO packet per ms
  */
-u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
-		     int min_tso_segs)
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
+			    int min_tso_segs)
 {
 	u32 bytes, segs;
 
@@ -1720,7 +1720,6 @@ u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 
 	return segs;
 }
-EXPORT_SYMBOL(tcp_tso_autosize);
 
 /* Return the number of segments we want in the skb we are transmitting.
  * See if congestion control module wants to decide; otherwise, autosize.
@@ -1728,11 +1727,13 @@ EXPORT_SYMBOL(tcp_tso_autosize);
 static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
 {
 	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
-	u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
+	u32 min_tso, tso_segs;
 
-	if (!tso_segs)
-		tso_segs = tcp_tso_autosize(sk, mss_now,
-				sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
+	min_tso = ca_ops->min_tso_segs ?
+			ca_ops->min_tso_segs(sk) :
+			sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
+
+	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
 	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
 }
 
-- 
cgit v1.2.3


From 71abf467bb630c7e2f4ef33267d98fb7d10d3ce9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 28 Feb 2018 14:40:47 -0800
Subject: tcp_bbr: remove bbr->tso_segs_goal

Its value is computed then immediately used,
there is no need to store it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_bbr.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index afc0567b8a98..c92014cb1e16 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -97,10 +97,9 @@ struct bbr {
 		packet_conservation:1,  /* use packet conservation? */
 		restore_cwnd:1,	     /* decided to revert cwnd to old value */
 		round_start:1,	     /* start of packet-timed tx->ack round? */
-		tso_segs_goal:7,     /* segments we want in each skb we send */
 		idle_restart:1,	     /* restarting after idle? */
 		probe_rtt_round_done:1,  /* a BBR_PROBE_RTT round at 4 pkts? */
-		unused:5,
+		unused:12,
 		lt_is_sampling:1,    /* taking long-term ("LT") samples now? */
 		lt_rtt_cnt:7,	     /* round trips in long-term interval */
 		lt_use_bw:1;	     /* use lt_bw as our bw estimate? */
@@ -267,10 +266,9 @@ static u32 bbr_min_tso_segs(struct sock *sk)
 	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
 }
 
-static void bbr_set_tso_segs_goal(struct sock *sk)
+static u32 bbr_tso_segs_goal(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct bbr *bbr = inet_csk_ca(sk);
 	u32 segs, bytes;
 
 	/* Sort of tcp_tso_autosize() but ignoring
@@ -280,7 +278,7 @@ static void bbr_set_tso_segs_goal(struct sock *sk)
 		      GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
 	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
 
-	bbr->tso_segs_goal = min(segs, 0x7FU);
+	return min(segs, 0x7FU);
 }
 
 /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
@@ -351,7 +349,7 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
 	cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
 
 	/* Allow enough full-sized skbs in flight to utilize end systems. */
-	cwnd += 3 * bbr->tso_segs_goal;
+	cwnd += 3 * bbr_tso_segs_goal(sk);
 
 	/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
 	cwnd = (cwnd + 1) & ~1U;
@@ -827,7 +825,6 @@ static void bbr_main(struct sock *sk, const struct rate_sample *rs)
 
 	bw = bbr_bw(sk);
 	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
-	bbr_set_tso_segs_goal(sk);
 	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
 }
 
@@ -837,7 +834,6 @@ static void bbr_init(struct sock *sk)
 	struct bbr *bbr = inet_csk_ca(sk);
 
 	bbr->prior_cwnd = 0;
-	bbr->tso_segs_goal = 0;	 /* default segs per skb until first ACK */
 	bbr->rtt_cnt = 0;
 	bbr->next_rtt_delivered = 0;
 	bbr->prev_ca_state = TCP_CA_Open;
-- 
cgit v1.2.3


From f84af33138a827dbdf7ee0df2ed82377b43cd1cc Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 1 Mar 2018 23:05:10 +0800
Subject: sctp: factor out sctp_sendmsg_to_asoc from sctp_sendmsg

This patch is to move the codes for checking and sending on
one asoc after this asoc has been found or created into
sctp_sendmsg_to_asoc.

Note that 'err != -ESRCH' check is for the case that asoc is
freed when waiting for tx buffer in sctp_sendmsg_to_asoc.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 230 +++++++++++++++++++++++-------------------------------
 1 file changed, 99 insertions(+), 131 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index bf271f8c2dc9..183129e2bc68 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1606,6 +1606,100 @@ static int sctp_error(struct sock *sk, int flags, int err)
 static int sctp_msghdr_parse(const struct msghdr *msg,
 			     struct sctp_cmsgs *cmsgs);
 
+static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
+				struct msghdr *msg, size_t msg_len,
+				struct sctp_transport *transport,
+				struct sctp_sndrcvinfo *sinfo)
+{
+	struct sock *sk = asoc->base.sk;
+	struct net *net = sock_net(sk);
+	struct sctp_datamsg *datamsg;
+	bool wait_connect = false;
+	struct sctp_chunk *chunk;
+	long timeo;
+	int err;
+
+	if (sinfo->sinfo_stream >= asoc->stream.outcnt) {
+		err = -EINVAL;
+		goto err;
+	}
+
+	if (unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) {
+		err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream);
+		if (err)
+			goto err;
+	}
+
+	if (sctp_sk(sk)->disable_fragments && msg_len > asoc->frag_point) {
+		err = -EMSGSIZE;
+		goto err;
+	}
+
+	if (sctp_state(asoc, CLOSED)) {
+		err = sctp_primitive_ASSOCIATE(net, asoc, NULL);
+		if (err)
+			goto err;
+
+		if (sctp_sk(sk)->strm_interleave) {
+			timeo = sock_sndtimeo(sk, 0);
+			err = sctp_wait_for_connect(asoc, &timeo);
+			if (err)
+				goto err;
+		} else {
+			wait_connect = true;
+		}
+
+		pr_debug("%s: we associated primitively\n", __func__);
+	}
+
+	if (asoc->pmtu_pending)
+		sctp_assoc_pending_pmtu(asoc);
+
+	if (sctp_wspace(asoc) < msg_len)
+		sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
+
+	if (!sctp_wspace(asoc)) {
+		timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+		err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
+		if (err)
+			goto err;
+	}
+
+	datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter);
+	if (IS_ERR(datamsg)) {
+		err = PTR_ERR(datamsg);
+		goto err;
+	}
+
+	asoc->force_delay = !!(msg->msg_flags & MSG_MORE);
+
+	list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
+		sctp_chunk_hold(chunk);
+		sctp_set_owner_w(chunk);
+		chunk->transport = transport;
+	}
+
+	err = sctp_primitive_SEND(net, asoc, datamsg);
+	if (err) {
+		sctp_datamsg_free(datamsg);
+		goto err;
+	}
+
+	pr_debug("%s: we sent primitively\n", __func__);
+
+	sctp_datamsg_put(datamsg);
+
+	if (unlikely(wait_connect)) {
+		timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+		sctp_wait_for_connect(asoc, &timeo);
+	}
+
+	err = msg_len;
+
+err:
+	return err;
+}
+
 static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 {
 	struct net *net = sock_net(sk);
@@ -1622,11 +1716,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 	sctp_assoc_t associd = 0;
 	struct sctp_cmsgs cmsgs = { NULL };
 	enum sctp_scope scope;
-	bool fill_sinfo_ttl = false, wait_connect = false;
-	struct sctp_datamsg *datamsg;
-	int msg_flags = msg->msg_flags;
+	bool fill_sinfo_ttl = false;
 	__u16 sinfo_flags = 0;
-	long timeo;
 	int err;
 
 	err = 0;
@@ -1923,49 +2014,6 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 		goto out_free;
 	}
 
-	if (asoc->pmtu_pending)
-		sctp_assoc_pending_pmtu(asoc);
-
-	/* If fragmentation is disabled and the message length exceeds the
-	 * association fragmentation point, return EMSGSIZE.  The I-D
-	 * does not specify what this error is, but this looks like
-	 * a great fit.
-	 */
-	if (sctp_sk(sk)->disable_fragments && (msg_len > asoc->frag_point)) {
-		err = -EMSGSIZE;
-		goto out_free;
-	}
-
-	/* Check for invalid stream. */
-	if (sinfo->sinfo_stream >= asoc->stream.outcnt) {
-		err = -EINVAL;
-		goto out_free;
-	}
-
-	/* Allocate sctp_stream_out_ext if not already done */
-	if (unlikely(!asoc->stream.out[sinfo->sinfo_stream].ext)) {
-		err = sctp_stream_init_ext(&asoc->stream, sinfo->sinfo_stream);
-		if (err)
-			goto out_free;
-	}
-
-	if (sctp_wspace(asoc) < msg_len)
-		sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
-
-	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
-	if (!sctp_wspace(asoc)) {
-		/* sk can be changed by peel off when waiting for buf. */
-		err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
-		if (err) {
-			if (err == -ESRCH) {
-				/* asoc is already dead. */
-				new_asoc = NULL;
-				err = -EPIPE;
-			}
-			goto out_free;
-		}
-	}
-
 	/* If an address is passed with the sendto/sendmsg call, it is used
 	 * to override the primary destination address in the TCP model, or
 	 * when SCTP_ADDR_OVER flag is set in the UDP model.
@@ -1980,96 +2028,16 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 	} else
 		chunk_tp = NULL;
 
-	/* Auto-connect, if we aren't connected already. */
-	if (sctp_state(asoc, CLOSED)) {
-		err = sctp_primitive_ASSOCIATE(net, asoc, NULL);
-		if (err < 0)
-			goto out_free;
-
-		/* If stream interleave is enabled, wait_connect has to be
-		 * done earlier than data enqueue, as it needs to make data
-		 * or idata according to asoc->intl_enable which is set
-		 * after connection is done.
-		 */
-		if (sctp_sk(asoc->base.sk)->strm_interleave) {
-			timeo = sock_sndtimeo(sk, 0);
-			err = sctp_wait_for_connect(asoc, &timeo);
-			if (err)
-				goto out_unlock;
-		} else {
-			wait_connect = true;
-		}
-
-		pr_debug("%s: we associated primitively\n", __func__);
-	}
-
-	/* Break the message into multiple chunks of maximum size. */
-	datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter);
-	if (IS_ERR(datamsg)) {
-		err = PTR_ERR(datamsg);
-		goto out_free;
-	}
-	asoc->force_delay = !!(msg->msg_flags & MSG_MORE);
-
-	/* Now send the (possibly) fragmented message. */
-	list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
-		sctp_chunk_hold(chunk);
-
-		/* Do accounting for the write space.  */
-		sctp_set_owner_w(chunk);
-
-		chunk->transport = chunk_tp;
-	}
-
-	/* Send it to the lower layers.  Note:  all chunks
-	 * must either fail or succeed.   The lower layer
-	 * works that way today.  Keep it that way or this
-	 * breaks.
-	 */
-	err = sctp_primitive_SEND(net, asoc, datamsg);
-	/* Did the lower layer accept the chunk? */
-	if (err) {
-		sctp_datamsg_free(datamsg);
-		goto out_free;
-	}
-
-	pr_debug("%s: we sent primitively\n", __func__);
-
-	sctp_datamsg_put(datamsg);
-	err = msg_len;
-
-	if (unlikely(wait_connect)) {
-		timeo = sock_sndtimeo(sk, msg_flags & MSG_DONTWAIT);
-		sctp_wait_for_connect(asoc, &timeo);
-	}
-
-	/* If we are already past ASSOCIATE, the lower
-	 * layers are responsible for association cleanup.
-	 */
-	goto out_unlock;
+	/* Send msg to the asoc */
+	err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, chunk_tp, sinfo);
 
 out_free:
-	if (new_asoc)
+	if (err < 0 && err != -ESRCH && new_asoc)
 		sctp_association_free(asoc);
 out_unlock:
 	release_sock(sk);
-
 out_nounlock:
-	return sctp_error(sk, msg_flags, err);
-
-#if 0
-do_sock_err:
-	if (msg_len)
-		err = msg_len;
-	else
-		err = sock_error(sk);
-	goto out;
-
-do_interrupted:
-	if (msg_len)
-		err = msg_len;
-	goto out;
-#endif /* 0 */
+	return sctp_error(sk, msg->msg_flags, err);
 }
 
 /* This is an extended version of skb_pull() that removes the data from the
-- 
cgit v1.2.3


From 2bfd80f9edbfc18c6247be929df3b348329141a0 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 1 Mar 2018 23:05:11 +0800
Subject: sctp: factor out sctp_sendmsg_new_asoc from sctp_sendmsg

This patch is to move the codes for creating a new asoc if
no asoc was found into sctp_sendmsg_new_asoc.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 201 +++++++++++++++++++++++-------------------------------
 1 file changed, 86 insertions(+), 115 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 183129e2bc68..58bb55dce8f6 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1606,6 +1606,87 @@ static int sctp_error(struct sock *sk, int flags, int err)
 static int sctp_msghdr_parse(const struct msghdr *msg,
 			     struct sctp_cmsgs *cmsgs);
 
+static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
+				 struct sctp_cmsgs *cmsgs,
+				 union sctp_addr *daddr,
+				 struct sctp_transport **tp)
+{
+	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+	struct net *net = sock_net(sk);
+	struct sctp_association *asoc;
+	enum sctp_scope scope;
+	int err = -EINVAL;
+
+	*tp = NULL;
+
+	if (sflags & (SCTP_EOF | SCTP_ABORT))
+		return -EINVAL;
+
+	if (sctp_style(sk, TCP) && (sctp_sstate(sk, ESTABLISHED) ||
+				    sctp_sstate(sk, CLOSING)))
+		return -EADDRNOTAVAIL;
+
+	if (sctp_endpoint_is_peeled_off(ep, daddr))
+		return -EADDRNOTAVAIL;
+
+	if (!ep->base.bind_addr.port) {
+		if (sctp_autobind(sk))
+			return -EAGAIN;
+	} else {
+		if (ep->base.bind_addr.port < inet_prot_sock(net) &&
+		    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+			return -EACCES;
+	}
+
+	scope = sctp_scope(daddr);
+
+	asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
+	if (!asoc)
+		return -ENOMEM;
+
+	if (sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL) < 0) {
+		err = -ENOMEM;
+		goto free;
+	}
+
+	if (cmsgs->init) {
+		struct sctp_initmsg *init = cmsgs->init;
+
+		if (init->sinit_num_ostreams) {
+			__u16 outcnt = init->sinit_num_ostreams;
+
+			asoc->c.sinit_num_ostreams = outcnt;
+			/* outcnt has been changed, need to re-init stream */
+			err = sctp_stream_init(&asoc->stream, outcnt, 0,
+					       GFP_KERNEL);
+			if (err)
+				goto free;
+		}
+
+		if (init->sinit_max_instreams)
+			asoc->c.sinit_max_instreams = init->sinit_max_instreams;
+
+		if (init->sinit_max_attempts)
+			asoc->max_init_attempts = init->sinit_max_attempts;
+
+		if (init->sinit_max_init_timeo)
+			asoc->max_init_timeo =
+				msecs_to_jiffies(init->sinit_max_init_timeo);
+	}
+
+	*tp = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
+	if (!*tp) {
+		err = -ENOMEM;
+		goto free;
+	}
+
+	return 0;
+
+free:
+	sctp_association_free(asoc);
+	return err;
+}
+
 static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 				struct msghdr *msg, size_t msg_len,
 				struct sctp_transport *transport,
@@ -1715,7 +1796,6 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 	struct sctp_initmsg *sinit;
 	sctp_assoc_t associd = 0;
 	struct sctp_cmsgs cmsgs = { NULL };
-	enum sctp_scope scope;
 	bool fill_sinfo_ttl = false;
 	__u16 sinfo_flags = 0;
 	int err;
@@ -1817,20 +1897,6 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 	if (msg_name) {
 		/* Look for a matching association on the endpoint. */
 		asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport);
-
-		/* If we could not find a matching association on the
-		 * endpoint, make sure that it is not a TCP-style
-		 * socket that already has an association or there is
-		 * no peeled-off association on another socket.
-		 */
-		if (!asoc &&
-		    ((sctp_style(sk, TCP) &&
-		      (sctp_sstate(sk, ESTABLISHED) ||
-		       sctp_sstate(sk, CLOSING))) ||
-		     sctp_endpoint_is_peeled_off(ep, &to))) {
-			err = -EADDRNOTAVAIL;
-			goto out_unlock;
-		}
 	} else {
 		asoc = sctp_id2assoc(sk, associd);
 		if (!asoc) {
@@ -1879,108 +1945,13 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 
 	/* Do we need to create the association?  */
 	if (!asoc) {
-		pr_debug("%s: there is no association yet\n", __func__);
-
-		if (sinfo_flags & (SCTP_EOF | SCTP_ABORT)) {
-			err = -EINVAL;
-			goto out_unlock;
-		}
-
-		/* Check for invalid stream against the stream counts,
-		 * either the default or the user specified stream counts.
-		 */
-		if (sinfo) {
-			if (!sinit || !sinit->sinit_num_ostreams) {
-				/* Check against the defaults. */
-				if (sinfo->sinfo_stream >=
-				    sp->initmsg.sinit_num_ostreams) {
-					err = -EINVAL;
-					goto out_unlock;
-				}
-			} else {
-				/* Check against the requested.  */
-				if (sinfo->sinfo_stream >=
-				    sinit->sinit_num_ostreams) {
-					err = -EINVAL;
-					goto out_unlock;
-				}
-			}
-		}
-
-		/*
-		 * API 3.1.2 bind() - UDP Style Syntax
-		 * If a bind() or sctp_bindx() is not called prior to a
-		 * sendmsg() call that initiates a new association, the
-		 * system picks an ephemeral port and will choose an address
-		 * set equivalent to binding with a wildcard address.
-		 */
-		if (!ep->base.bind_addr.port) {
-			if (sctp_autobind(sk)) {
-				err = -EAGAIN;
-				goto out_unlock;
-			}
-		} else {
-			/*
-			 * If an unprivileged user inherits a one-to-many
-			 * style socket with open associations on a privileged
-			 * port, it MAY be permitted to accept new associations,
-			 * but it SHOULD NOT be permitted to open new
-			 * associations.
-			 */
-			if (ep->base.bind_addr.port < inet_prot_sock(net) &&
-			    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) {
-				err = -EACCES;
-				goto out_unlock;
-			}
-		}
-
-		scope = sctp_scope(&to);
-		new_asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
-		if (!new_asoc) {
-			err = -ENOMEM;
+		err = sctp_sendmsg_new_asoc(sk, sinfo_flags, &cmsgs, &to,
+					    &transport);
+		if (err)
 			goto out_unlock;
-		}
-		asoc = new_asoc;
-		err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL);
-		if (err < 0) {
-			err = -ENOMEM;
-			goto out_free;
-		}
-
-		/* If the SCTP_INIT ancillary data is specified, set all
-		 * the association init values accordingly.
-		 */
-		if (sinit) {
-			if (sinit->sinit_num_ostreams) {
-				__u16 outcnt = sinit->sinit_num_ostreams;
-
-				asoc->c.sinit_num_ostreams = outcnt;
-				/* outcnt has been changed, so re-init stream */
-				err = sctp_stream_init(&asoc->stream, outcnt, 0,
-						       GFP_KERNEL);
-				if (err)
-					goto out_free;
-			}
-			if (sinit->sinit_max_instreams) {
-				asoc->c.sinit_max_instreams =
-					sinit->sinit_max_instreams;
-			}
-			if (sinit->sinit_max_attempts) {
-				asoc->max_init_attempts
-					= sinit->sinit_max_attempts;
-			}
-			if (sinit->sinit_max_init_timeo) {
-				asoc->max_init_timeo =
-				 msecs_to_jiffies(sinit->sinit_max_init_timeo);
-			}
-		}
 
-		/* Prime the peer's transport structures.  */
-		transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL, SCTP_UNKNOWN);
-		if (!transport) {
-			err = -ENOMEM;
-			goto out_free;
-		}
+		asoc = transport->asoc;
+		new_asoc = asoc;
 	}
 
 	/* ASSERT: we have a valid association at this point.  */
-- 
cgit v1.2.3


From c2666de1fde3a02583c2c4d4af4bc54f7252e891 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 1 Mar 2018 23:05:12 +0800
Subject: sctp: factor out sctp_sendmsg_check_sflags from sctp_sendmsg

This patch is to move the codes for checking sinfo_flags on one asoc
after this asoc has been found into sctp_sendmsg_check_sflags.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 72 +++++++++++++++++++++++++++----------------------------
 1 file changed, 36 insertions(+), 36 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 58bb55dce8f6..93cff9963614 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1687,6 +1687,39 @@ free:
 	return err;
 }
 
+static int sctp_sendmsg_check_sflags(struct sctp_association *asoc,
+				     __u16 sflags, struct msghdr *msg,
+				     size_t msg_len)
+{
+	struct sock *sk = asoc->base.sk;
+	struct net *net = sock_net(sk);
+
+	if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP))
+		return -EPIPE;
+
+	if (sflags & SCTP_EOF) {
+		pr_debug("%s: shutting down association:%p\n", __func__, asoc);
+		sctp_primitive_SHUTDOWN(net, asoc, NULL);
+
+		return 0;
+	}
+
+	if (sflags & SCTP_ABORT) {
+		struct sctp_chunk *chunk;
+
+		chunk = sctp_make_abort_user(asoc, msg, msg_len);
+		if (!chunk)
+			return -ENOMEM;
+
+		pr_debug("%s: aborting association:%p\n", __func__, asoc);
+		sctp_primitive_ABORT(net, asoc, chunk);
+
+		return 0;
+	}
+
+	return 1;
+}
+
 static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 				struct msghdr *msg, size_t msg_len,
 				struct sctp_transport *transport,
@@ -1783,12 +1816,10 @@ err:
 
 static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 {
-	struct net *net = sock_net(sk);
 	struct sctp_sock *sp;
 	struct sctp_endpoint *ep;
 	struct sctp_association *new_asoc = NULL, *asoc = NULL;
 	struct sctp_transport *transport, *chunk_tp;
-	struct sctp_chunk *chunk;
 	union sctp_addr to;
 	struct sockaddr *msg_name = NULL;
 	struct sctp_sndrcvinfo default_sinfo;
@@ -1906,41 +1937,10 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 	}
 
 	if (asoc) {
-		pr_debug("%s: just looked up association:%p\n", __func__, asoc);
-
-		/* We cannot send a message on a TCP-style SCTP_SS_ESTABLISHED
-		 * socket that has an association in CLOSED state. This can
-		 * happen when an accepted socket has an association that is
-		 * already CLOSED.
-		 */
-		if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP)) {
-			err = -EPIPE;
-			goto out_unlock;
-		}
-
-		if (sinfo_flags & SCTP_EOF) {
-			pr_debug("%s: shutting down association:%p\n",
-				 __func__, asoc);
-
-			sctp_primitive_SHUTDOWN(net, asoc, NULL);
-			err = 0;
+		err = sctp_sendmsg_check_sflags(asoc, sinfo_flags, msg,
+						msg_len);
+		if (err <= 0)
 			goto out_unlock;
-		}
-		if (sinfo_flags & SCTP_ABORT) {
-
-			chunk = sctp_make_abort_user(asoc, msg, msg_len);
-			if (!chunk) {
-				err = -ENOMEM;
-				goto out_unlock;
-			}
-
-			pr_debug("%s: aborting association:%p\n",
-				 __func__, asoc);
-
-			sctp_primitive_ABORT(net, asoc, chunk);
-			err = 0;
-			goto out_unlock;
-		}
 	}
 
 	/* Do we need to create the association?  */
-- 
cgit v1.2.3


From becef9b1e249c849ca15889586e02668d6d723a2 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 1 Mar 2018 23:05:13 +0800
Subject: sctp: factor out sctp_sendmsg_get_daddr from sctp_sendmsg

This patch is to move the codes for trying to get daddr from
msg->msg_name into sctp_sendmsg_get_daddr.

Note that after adding 'daddr', 'to' and 'msg_name' can be
deleted.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 58 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 34 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 93cff9963614..68691d2b3167 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1814,14 +1814,35 @@ err:
 	return err;
 }
 
+static union sctp_addr *sctp_sendmsg_get_daddr(struct sock *sk,
+					       const struct msghdr *msg,
+					       struct sctp_cmsgs *cmsgs)
+{
+	union sctp_addr *daddr = NULL;
+	int err;
+
+	if (!sctp_style(sk, UDP_HIGH_BANDWIDTH) && msg->msg_name) {
+		int len = msg->msg_namelen;
+
+		if (len > sizeof(*daddr))
+			len = sizeof(*daddr);
+
+		daddr = (union sctp_addr *)msg->msg_name;
+
+		err = sctp_verify_addr(sk, daddr, len);
+		if (err)
+			return ERR_PTR(err);
+	}
+
+	return daddr;
+}
+
 static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 {
 	struct sctp_sock *sp;
 	struct sctp_endpoint *ep;
 	struct sctp_association *new_asoc = NULL, *asoc = NULL;
 	struct sctp_transport *transport, *chunk_tp;
-	union sctp_addr to;
-	struct sockaddr *msg_name = NULL;
 	struct sctp_sndrcvinfo default_sinfo;
 	struct sctp_sndrcvinfo *sinfo;
 	struct sctp_initmsg *sinit;
@@ -1829,6 +1850,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 	struct sctp_cmsgs cmsgs = { NULL };
 	bool fill_sinfo_ttl = false;
 	__u16 sinfo_flags = 0;
+	union sctp_addr *daddr;
 	int err;
 
 	err = 0;
@@ -1851,23 +1873,11 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 		goto out_nounlock;
 	}
 
-	/* Fetch the destination address for this packet.  This
-	 * address only selects the association--it is not necessarily
-	 * the address we will send to.
-	 * For a peeled-off socket, msg_name is ignored.
-	 */
-	if (!sctp_style(sk, UDP_HIGH_BANDWIDTH) && msg->msg_name) {
-		int msg_namelen = msg->msg_namelen;
-
-		err = sctp_verify_addr(sk, (union sctp_addr *)msg->msg_name,
-				       msg_namelen);
-		if (err)
-			return err;
-
-		if (msg_namelen > sizeof(to))
-			msg_namelen = sizeof(to);
-		memcpy(&to, msg->msg_name, msg_namelen);
-		msg_name = msg->msg_name;
+	/* Get daddr from msg */
+	daddr = sctp_sendmsg_get_daddr(sk, msg, &cmsgs);
+	if (IS_ERR(daddr)) {
+		err = PTR_ERR(daddr);
+		goto out_nounlock;
 	}
 
 	sinit = cmsgs.init;
@@ -1925,9 +1935,9 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 	lock_sock(sk);
 
 	/* If a msg_name has been specified, assume this is to be used.  */
-	if (msg_name) {
+	if (daddr) {
 		/* Look for a matching association on the endpoint. */
-		asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport);
+		asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
 	} else {
 		asoc = sctp_id2assoc(sk, associd);
 		if (!asoc) {
@@ -1945,7 +1955,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 
 	/* Do we need to create the association?  */
 	if (!asoc) {
-		err = sctp_sendmsg_new_asoc(sk, sinfo_flags, &cmsgs, &to,
+		err = sctp_sendmsg_new_asoc(sk, sinfo_flags, &cmsgs, daddr,
 					    &transport);
 		if (err)
 			goto out_unlock;
@@ -1989,9 +1999,9 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 	 * to override the primary destination address in the TCP model, or
 	 * when SCTP_ADDR_OVER flag is set in the UDP model.
 	 */
-	if ((sctp_style(sk, TCP) && msg_name) ||
+	if ((sctp_style(sk, TCP) && daddr) ||
 	    (sinfo_flags & SCTP_ADDR_OVER)) {
-		chunk_tp = sctp_assoc_lookup_paddr(asoc, &to);
+		chunk_tp = sctp_assoc_lookup_paddr(asoc, daddr);
 		if (!chunk_tp) {
 			err = -EINVAL;
 			goto out_free;
-- 
cgit v1.2.3


From 204f817fb9ab1b42c34c27d593cad2992d575f71 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 1 Mar 2018 23:05:14 +0800
Subject: sctp: factor out sctp_sendmsg_parse from sctp_sendmsg

This patch is to move the codes for parsing msghdr and checking
sk into sctp_sendmsg_parse.

Note that different from before, 'sinfo' in sctp_sendmsg won't
be NULL any more. It gets the value either from cmsgs->srinfo,
cmsgs->sinfo or asoc. With it, the 'sinfo' and 'fill_sinfo_ttl'
check can be removed from sctp_sendmsg.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 172 ++++++++++++++++++++++--------------------------------
 1 file changed, 69 insertions(+), 103 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 68691d2b3167..bf089e59c792 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1606,6 +1606,61 @@ static int sctp_error(struct sock *sk, int flags, int err)
 static int sctp_msghdr_parse(const struct msghdr *msg,
 			     struct sctp_cmsgs *cmsgs);
 
+static int sctp_sendmsg_parse(struct sock *sk, struct sctp_cmsgs *cmsgs,
+			      struct sctp_sndrcvinfo *srinfo,
+			      const struct msghdr *msg, size_t msg_len)
+{
+	__u16 sflags;
+	int err;
+
+	if (sctp_sstate(sk, LISTENING) && sctp_style(sk, TCP))
+		return -EPIPE;
+
+	if (msg_len > sk->sk_sndbuf)
+		return -EMSGSIZE;
+
+	memset(cmsgs, 0, sizeof(*cmsgs));
+	err = sctp_msghdr_parse(msg, cmsgs);
+	if (err) {
+		pr_debug("%s: msghdr parse err:%x\n", __func__, err);
+		return err;
+	}
+
+	memset(srinfo, 0, sizeof(*srinfo));
+	if (cmsgs->srinfo) {
+		srinfo->sinfo_stream = cmsgs->srinfo->sinfo_stream;
+		srinfo->sinfo_flags = cmsgs->srinfo->sinfo_flags;
+		srinfo->sinfo_ppid = cmsgs->srinfo->sinfo_ppid;
+		srinfo->sinfo_context = cmsgs->srinfo->sinfo_context;
+		srinfo->sinfo_assoc_id = cmsgs->srinfo->sinfo_assoc_id;
+		srinfo->sinfo_timetolive = cmsgs->srinfo->sinfo_timetolive;
+	}
+
+	if (cmsgs->sinfo) {
+		srinfo->sinfo_stream = cmsgs->sinfo->snd_sid;
+		srinfo->sinfo_flags = cmsgs->sinfo->snd_flags;
+		srinfo->sinfo_ppid = cmsgs->sinfo->snd_ppid;
+		srinfo->sinfo_context = cmsgs->sinfo->snd_context;
+		srinfo->sinfo_assoc_id = cmsgs->sinfo->snd_assoc_id;
+	}
+
+	sflags = srinfo->sinfo_flags;
+	if (!sflags && msg_len)
+		return 0;
+
+	if (sctp_style(sk, TCP) && (sflags & (SCTP_EOF | SCTP_ABORT)))
+		return -EINVAL;
+
+	if (((sflags & SCTP_EOF) && msg_len > 0) ||
+	    (!(sflags & (SCTP_EOF | SCTP_ABORT)) && msg_len == 0))
+		return -EINVAL;
+
+	if ((sflags & SCTP_ADDR_OVER) && !msg->msg_name)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
 				 struct sctp_cmsgs *cmsgs,
 				 union sctp_addr *daddr,
@@ -1839,39 +1894,23 @@ static union sctp_addr *sctp_sendmsg_get_daddr(struct sock *sk,
 
 static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 {
-	struct sctp_sock *sp;
-	struct sctp_endpoint *ep;
+	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
 	struct sctp_association *new_asoc = NULL, *asoc = NULL;
 	struct sctp_transport *transport, *chunk_tp;
-	struct sctp_sndrcvinfo default_sinfo;
-	struct sctp_sndrcvinfo *sinfo;
-	struct sctp_initmsg *sinit;
+	struct sctp_sndrcvinfo _sinfo, *sinfo;
 	sctp_assoc_t associd = 0;
 	struct sctp_cmsgs cmsgs = { NULL };
-	bool fill_sinfo_ttl = false;
 	__u16 sinfo_flags = 0;
 	union sctp_addr *daddr;
 	int err;
 
-	err = 0;
-	sp = sctp_sk(sk);
-	ep = sp->ep;
-
-	pr_debug("%s: sk:%p, msg:%p, msg_len:%zu ep:%p\n", __func__, sk,
-		 msg, msg_len, ep);
-
-	/* We cannot send a message over a TCP-style listening socket. */
-	if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) {
-		err = -EPIPE;
+	/* Parse and get snd_info */
+	err = sctp_sendmsg_parse(sk, &cmsgs, &_sinfo, msg, msg_len);
+	if (err)
 		goto out_nounlock;
-	}
 
-	/* Parse out the SCTP CMSGs.  */
-	err = sctp_msghdr_parse(msg, &cmsgs);
-	if (err) {
-		pr_debug("%s: msghdr parse err:%x\n", __func__, err);
-		goto out_nounlock;
-	}
+	sinfo  = &_sinfo;
+	sinfo_flags = sinfo->sinfo_flags;
 
 	/* Get daddr from msg */
 	daddr = sctp_sendmsg_get_daddr(sk, msg, &cmsgs);
@@ -1880,58 +1919,6 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 		goto out_nounlock;
 	}
 
-	sinit = cmsgs.init;
-	if (cmsgs.sinfo != NULL) {
-		memset(&default_sinfo, 0, sizeof(default_sinfo));
-		default_sinfo.sinfo_stream = cmsgs.sinfo->snd_sid;
-		default_sinfo.sinfo_flags = cmsgs.sinfo->snd_flags;
-		default_sinfo.sinfo_ppid = cmsgs.sinfo->snd_ppid;
-		default_sinfo.sinfo_context = cmsgs.sinfo->snd_context;
-		default_sinfo.sinfo_assoc_id = cmsgs.sinfo->snd_assoc_id;
-
-		sinfo = &default_sinfo;
-		fill_sinfo_ttl = true;
-	} else {
-		sinfo = cmsgs.srinfo;
-	}
-	/* Did the user specify SNDINFO/SNDRCVINFO? */
-	if (sinfo) {
-		sinfo_flags = sinfo->sinfo_flags;
-		associd = sinfo->sinfo_assoc_id;
-	}
-
-	pr_debug("%s: msg_len:%zu, sinfo_flags:0x%x\n", __func__,
-		 msg_len, sinfo_flags);
-
-	/* SCTP_EOF or SCTP_ABORT cannot be set on a TCP-style socket. */
-	if (sctp_style(sk, TCP) && (sinfo_flags & (SCTP_EOF | SCTP_ABORT))) {
-		err = -EINVAL;
-		goto out_nounlock;
-	}
-
-	/* If SCTP_EOF is set, no data can be sent. Disallow sending zero
-	 * length messages when SCTP_EOF|SCTP_ABORT is not set.
-	 * If SCTP_ABORT is set, the message length could be non zero with
-	 * the msg_iov set to the user abort reason.
-	 */
-	if (((sinfo_flags & SCTP_EOF) && (msg_len > 0)) ||
-	    (!(sinfo_flags & (SCTP_EOF|SCTP_ABORT)) && (msg_len == 0))) {
-		err = -EINVAL;
-		goto out_nounlock;
-	}
-
-	/* If SCTP_ADDR_OVER is set, there must be an address
-	 * specified in msg_name.
-	 */
-	if ((sinfo_flags & SCTP_ADDR_OVER) && (!msg->msg_name)) {
-		err = -EINVAL;
-		goto out_nounlock;
-	}
-
-	transport = NULL;
-
-	pr_debug("%s: about to look up association\n", __func__);
-
 	lock_sock(sk);
 
 	/* If a msg_name has been specified, assume this is to be used.  */
@@ -1964,36 +1951,15 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 		new_asoc = asoc;
 	}
 
-	/* ASSERT: we have a valid association at this point.  */
-	pr_debug("%s: we have a valid association\n", __func__);
-
-	if (!sinfo) {
-		/* If the user didn't specify SNDINFO/SNDRCVINFO, make up
-		 * one with some defaults.
-		 */
-		memset(&default_sinfo, 0, sizeof(default_sinfo));
-		default_sinfo.sinfo_stream = asoc->default_stream;
-		default_sinfo.sinfo_flags = asoc->default_flags;
-		default_sinfo.sinfo_ppid = asoc->default_ppid;
-		default_sinfo.sinfo_context = asoc->default_context;
-		default_sinfo.sinfo_timetolive = asoc->default_timetolive;
-		default_sinfo.sinfo_assoc_id = sctp_assoc2id(asoc);
-
-		sinfo = &default_sinfo;
-	} else if (fill_sinfo_ttl) {
-		/* In case SNDINFO was specified, we still need to fill
-		 * it with a default ttl from the assoc here.
-		 */
-		sinfo->sinfo_timetolive = asoc->default_timetolive;
+	if (!cmsgs.srinfo && !cmsgs.sinfo) {
+		sinfo->sinfo_stream = asoc->default_stream;
+		sinfo->sinfo_ppid = asoc->default_ppid;
+		sinfo->sinfo_context = asoc->default_context;
+		sinfo->sinfo_assoc_id = sctp_assoc2id(asoc);
 	}
 
-	/* API 7.1.7, the sndbuf size per association bounds the
-	 * maximum size of data that can be sent in a single send call.
-	 */
-	if (msg_len > sk->sk_sndbuf) {
-		err = -EMSGSIZE;
-		goto out_free;
-	}
+	if (!cmsgs.srinfo)
+		sinfo->sinfo_timetolive = asoc->default_timetolive;
 
 	/* If an address is passed with the sendto/sendmsg call, it is used
 	 * to override the primary destination address in the TCP model, or
-- 
cgit v1.2.3


From d42cb06e5b542cabea88c9074b2abf90d43757f7 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 1 Mar 2018 23:05:15 +0800
Subject: sctp: factor out sctp_sendmsg_update_sinfo from sctp_sendmsg

This patch is to move the codes for trying to get sinfo from
asoc into sctp_sendmsg_update_sinfo.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index bf089e59c792..bd1a657117f1 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1892,6 +1892,21 @@ static union sctp_addr *sctp_sendmsg_get_daddr(struct sock *sk,
 	return daddr;
 }
 
+static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc,
+				      struct sctp_sndrcvinfo *sinfo,
+				      struct sctp_cmsgs *cmsgs)
+{
+	if (!cmsgs->srinfo && !cmsgs->sinfo) {
+		sinfo->sinfo_stream = asoc->default_stream;
+		sinfo->sinfo_ppid = asoc->default_ppid;
+		sinfo->sinfo_context = asoc->default_context;
+		sinfo->sinfo_assoc_id = sctp_assoc2id(asoc);
+	}
+
+	if (!cmsgs->srinfo)
+		sinfo->sinfo_timetolive = asoc->default_timetolive;
+}
+
 static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 {
 	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
@@ -1951,15 +1966,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 		new_asoc = asoc;
 	}
 
-	if (!cmsgs.srinfo && !cmsgs.sinfo) {
-		sinfo->sinfo_stream = asoc->default_stream;
-		sinfo->sinfo_ppid = asoc->default_ppid;
-		sinfo->sinfo_context = asoc->default_context;
-		sinfo->sinfo_assoc_id = sctp_assoc2id(asoc);
-	}
-
-	if (!cmsgs.srinfo)
-		sinfo->sinfo_timetolive = asoc->default_timetolive;
+	/* Update snd_info with the asoc */
+	sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs);
 
 	/* If an address is passed with the sendto/sendmsg call, it is used
 	 * to override the primary destination address in the TCP model, or
-- 
cgit v1.2.3


From 8e87c6eb18f9d7427054f28a284d495c174d9970 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 1 Mar 2018 23:05:16 +0800
Subject: sctp: remove the unnecessary transport looking up from sctp_sendmsg

Now sctp_assoc_lookup_paddr can only be called only if daddr has
been set. But if daddr has been set, sctp_endpoint_lookup_assoc
would be done, where it could already have the transport.

So this unnecessary transport looking up should be removed, but
only reset transport as NULL when SCTP_ADDR_OVER is not set for
UDP type socket.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 24 ++++++------------------
 1 file changed, 6 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index bd1a657117f1..426031095afe 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1911,7 +1911,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 {
 	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
 	struct sctp_association *new_asoc = NULL, *asoc = NULL;
-	struct sctp_transport *transport, *chunk_tp;
+	struct sctp_transport *transport = NULL;
 	struct sctp_sndrcvinfo _sinfo, *sinfo;
 	sctp_assoc_t associd = 0;
 	struct sctp_cmsgs cmsgs = { NULL };
@@ -1966,29 +1966,17 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 		new_asoc = asoc;
 	}
 
+	if (!sctp_style(sk, TCP) && !(sinfo_flags & SCTP_ADDR_OVER))
+		transport = NULL;
+
 	/* Update snd_info with the asoc */
 	sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs);
 
-	/* If an address is passed with the sendto/sendmsg call, it is used
-	 * to override the primary destination address in the TCP model, or
-	 * when SCTP_ADDR_OVER flag is set in the UDP model.
-	 */
-	if ((sctp_style(sk, TCP) && daddr) ||
-	    (sinfo_flags & SCTP_ADDR_OVER)) {
-		chunk_tp = sctp_assoc_lookup_paddr(asoc, daddr);
-		if (!chunk_tp) {
-			err = -EINVAL;
-			goto out_free;
-		}
-	} else
-		chunk_tp = NULL;
-
 	/* Send msg to the asoc */
-	err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, chunk_tp, sinfo);
-
-out_free:
+	err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, transport, sinfo);
 	if (err < 0 && err != -ESRCH && new_asoc)
 		sctp_association_free(asoc);
+
 out_unlock:
 	release_sock(sk);
 out_nounlock:
-- 
cgit v1.2.3


From 007b7e18be74a49b61f89664966ac1477e1c9608 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 1 Mar 2018 23:05:17 +0800
Subject: sctp: improve some variables in sctp_sendmsg

This patch mostly is to:

  - rename sinfo_flags as sflags, to make the indents look better, and
    also keep consistent with other sctp_sendmsg_xx functions.

  - replace new_asoc with bool new, no need to define a pointer here,
    as if new_asoc is set, it must be asoc.

  - rename the 'out_nounlock:' as 'out', shorter and nicer.

  - remove associd, only one place is using it now, just use
    sinfo->sinfo_assoc_id directly.

  - remove 'cmsgs' initialization in sctp_sendmsg, as it will be done
    in sctp_sendmsg_parse.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 426031095afe..a1c78fc19d95 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1910,28 +1910,28 @@ static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc,
 static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 {
 	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
-	struct sctp_association *new_asoc = NULL, *asoc = NULL;
 	struct sctp_transport *transport = NULL;
 	struct sctp_sndrcvinfo _sinfo, *sinfo;
-	sctp_assoc_t associd = 0;
-	struct sctp_cmsgs cmsgs = { NULL };
-	__u16 sinfo_flags = 0;
+	struct sctp_association *asoc;
+	struct sctp_cmsgs cmsgs;
 	union sctp_addr *daddr;
+	bool new = false;
+	__u16 sflags;
 	int err;
 
 	/* Parse and get snd_info */
 	err = sctp_sendmsg_parse(sk, &cmsgs, &_sinfo, msg, msg_len);
 	if (err)
-		goto out_nounlock;
+		goto out;
 
 	sinfo  = &_sinfo;
-	sinfo_flags = sinfo->sinfo_flags;
+	sflags = sinfo->sinfo_flags;
 
 	/* Get daddr from msg */
 	daddr = sctp_sendmsg_get_daddr(sk, msg, &cmsgs);
 	if (IS_ERR(daddr)) {
 		err = PTR_ERR(daddr);
-		goto out_nounlock;
+		goto out;
 	}
 
 	lock_sock(sk);
@@ -1941,7 +1941,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 		/* Look for a matching association on the endpoint. */
 		asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
 	} else {
-		asoc = sctp_id2assoc(sk, associd);
+		asoc = sctp_id2assoc(sk, sinfo->sinfo_assoc_id);
 		if (!asoc) {
 			err = -EPIPE;
 			goto out_unlock;
@@ -1949,24 +1949,23 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 	}
 
 	if (asoc) {
-		err = sctp_sendmsg_check_sflags(asoc, sinfo_flags, msg,
-						msg_len);
+		err = sctp_sendmsg_check_sflags(asoc, sflags, msg, msg_len);
 		if (err <= 0)
 			goto out_unlock;
 	}
 
 	/* Do we need to create the association?  */
 	if (!asoc) {
-		err = sctp_sendmsg_new_asoc(sk, sinfo_flags, &cmsgs, daddr,
+		err = sctp_sendmsg_new_asoc(sk, sflags, &cmsgs, daddr,
 					    &transport);
 		if (err)
 			goto out_unlock;
 
 		asoc = transport->asoc;
-		new_asoc = asoc;
+		new = true;
 	}
 
-	if (!sctp_style(sk, TCP) && !(sinfo_flags & SCTP_ADDR_OVER))
+	if (!sctp_style(sk, TCP) && !(sflags & SCTP_ADDR_OVER))
 		transport = NULL;
 
 	/* Update snd_info with the asoc */
@@ -1974,12 +1973,12 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 
 	/* Send msg to the asoc */
 	err = sctp_sendmsg_to_asoc(asoc, msg, msg_len, transport, sinfo);
-	if (err < 0 && err != -ESRCH && new_asoc)
+	if (err < 0 && err != -ESRCH && new)
 		sctp_association_free(asoc);
 
 out_unlock:
 	release_sock(sk);
-out_nounlock:
+out:
 	return sctp_error(sk, msg->msg_flags, err);
 }
 
-- 
cgit v1.2.3


From 0a3920d28b5490e6c90110156135dbc12c7fc413 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Thu, 1 Mar 2018 23:05:18 +0800
Subject: sctp: adjust some codes in a better order in sctp_sendmsg

sctp_sendmsg_new_asoc and SCTP_ADDR_OVER check is only necessary
when daddr is set, so move them up to if (daddr) statement.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index a1c78fc19d95..7fa76031bb08 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1936,38 +1936,38 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 
 	lock_sock(sk);
 
-	/* If a msg_name has been specified, assume this is to be used.  */
+	/* Get and check or create asoc */
 	if (daddr) {
-		/* Look for a matching association on the endpoint. */
 		asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
+		if (asoc) {
+			err = sctp_sendmsg_check_sflags(asoc, sflags, msg,
+							msg_len);
+			if (err <= 0)
+				goto out_unlock;
+		} else {
+			err = sctp_sendmsg_new_asoc(sk, sflags, &cmsgs, daddr,
+						    &transport);
+			if (err)
+				goto out_unlock;
+
+			asoc = transport->asoc;
+			new = true;
+		}
+
+		if (!sctp_style(sk, TCP) && !(sflags & SCTP_ADDR_OVER))
+			transport = NULL;
 	} else {
 		asoc = sctp_id2assoc(sk, sinfo->sinfo_assoc_id);
 		if (!asoc) {
 			err = -EPIPE;
 			goto out_unlock;
 		}
-	}
 
-	if (asoc) {
 		err = sctp_sendmsg_check_sflags(asoc, sflags, msg, msg_len);
 		if (err <= 0)
 			goto out_unlock;
 	}
 
-	/* Do we need to create the association?  */
-	if (!asoc) {
-		err = sctp_sendmsg_new_asoc(sk, sflags, &cmsgs, daddr,
-					    &transport);
-		if (err)
-			goto out_unlock;
-
-		asoc = transport->asoc;
-		new = true;
-	}
-
-	if (!sctp_style(sk, TCP) && !(sflags & SCTP_ADDR_OVER))
-		transport = NULL;
-
 	/* Update snd_info with the asoc */
 	sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs);
 
-- 
cgit v1.2.3


From 7efc0b6b666d757e07417f59397e7f5f340e74e0 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 2 Mar 2018 08:32:12 -0800
Subject: net/ipv4: Pass net to fib_multipath_hash instead of fib_info

fib_multipath_hash only needs net struct to check a sysctl. Make it
clear by passing net instead of fib_info. In the end this allows
alignment between the ipv4 and ipv6 versions.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 2 +-
 net/ipv4/route.c         | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 181b0d8d589c..e7c602c600ac 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1770,7 +1770,7 @@ void fib_select_path(struct net *net, struct fib_result *res,
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi->fib_nhs > 1) {
-		int h = fib_multipath_hash(res->fi, fl4, skb, NULL);
+		int h = fib_multipath_hash(net, fl4, skb, NULL);
 
 		fib_select_multipath(res, h);
 	}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3bb686dac273..1689c569bbc3 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1782,10 +1782,9 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb,
 }
 
 /* if skb is set it will be used and fl4 can be NULL */
-int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
+int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
 		       const struct sk_buff *skb, struct flow_keys *flkeys)
 {
-	struct net *net = fi->fib_net;
 	struct flow_keys hash_keys;
 	u32 mhash;
 
@@ -1852,7 +1851,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (res->fi && res->fi->fib_nhs > 1) {
-		int h = fib_multipath_hash(res->fi, NULL, skb, hkeys);
+		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
 
 		fib_select_multipath(res, h);
 	}
-- 
cgit v1.2.3


From 6f74b6c259158d89ad258cda8e86240e01052884 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 2 Mar 2018 08:32:13 -0800
Subject: net: Align ip_multipath_l3_keys and ip6_multipath_l3_keys

Symmetry is good and allows easy comparison that ipv4 and ipv6 are
doing the same thing. To that end, change ip_multipath_l3_keys to
set addresses at the end after the icmp compares, and move the
initialization of ipv6 flow keys to rt6_multipath_hash.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 20 +++++++++++---------
 net/ipv6/route.c |  4 ++--
 2 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1689c569bbc3..df57bc68e8e0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1748,37 +1748,39 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb,
 				 struct flow_keys *hash_keys)
 {
 	const struct iphdr *outer_iph = ip_hdr(skb);
+	const struct iphdr *key_iph = outer_iph;
 	const struct iphdr *inner_iph;
 	const struct icmphdr *icmph;
 	struct iphdr _inner_iph;
 	struct icmphdr _icmph;
 
-	hash_keys->addrs.v4addrs.src = outer_iph->saddr;
-	hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
 	if (likely(outer_iph->protocol != IPPROTO_ICMP))
-		return;
+		goto out;
 
 	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
-		return;
+		goto out;
 
 	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
 				   &_icmph);
 	if (!icmph)
-		return;
+		goto out;
 
 	if (icmph->type != ICMP_DEST_UNREACH &&
 	    icmph->type != ICMP_REDIRECT &&
 	    icmph->type != ICMP_TIME_EXCEEDED &&
 	    icmph->type != ICMP_PARAMETERPROB)
-		return;
+		goto out;
 
 	inner_iph = skb_header_pointer(skb,
 				       outer_iph->ihl * 4 + sizeof(_icmph),
 				       sizeof(_inner_iph), &_inner_iph);
 	if (!inner_iph)
-		return;
-	hash_keys->addrs.v4addrs.src = inner_iph->saddr;
-	hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
+		goto out;
+
+	key_iph = inner_iph;
+out:
+	hash_keys->addrs.v4addrs.src = key_iph->saddr;
+	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
 }
 
 /* if skb is set it will be used and fl4 can be NULL */
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e2bb40824c85..190d9690dfe0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1815,8 +1815,6 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb,
 	key_iph = inner_iph;
 	_flkeys = NULL;
 out:
-	memset(keys, 0, sizeof(*keys));
-	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
 	if (_flkeys) {
 		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
 		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
@@ -1837,6 +1835,8 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb,
 	struct flow_keys hash_keys;
 
 	if (skb) {
+		memset(&hash_keys, 0, sizeof(hash_keys));
+		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
 		ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
 		return flow_hash_from_keys(&hash_keys) >> 1;
 	}
-- 
cgit v1.2.3


From ec7127a5d53d891b77a11433026dca72c57eaf88 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 2 Mar 2018 08:32:14 -0800
Subject: net/ipv4: Simplify fib_multipath_hash with optional flow keys

As of commit e37b1e978bec5 ("ipv6: route: dissect flow in input path if
fib rules need it") fib_multipath_hash takes an optional flow keys. If
non-NULL it means the skb has already been dissected. If not set, then
fib_multipath_hash needs to call skb_flow_dissect_flow_keys.

Simplify the logic by setting flkeys to the local stack variable keys.
Simplifies fib_multipath_hash by only have 1 set of instructions
setting hash_keys.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index df57bc68e8e0..6a7b3cba3972 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1810,24 +1810,20 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
 			/* short-circuit if we already have L4 hash present */
 			if (skb->l4_hash)
 				return skb_get_hash_raw(skb) >> 1;
+
 			memset(&hash_keys, 0, sizeof(hash_keys));
 
-			if (flkeys) {
-				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
-				hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
-				hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
-				hash_keys.ports.src = flkeys->ports.src;
-				hash_keys.ports.dst = flkeys->ports.dst;
-				hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
-			} else {
+			if (!flkeys) {
 				skb_flow_dissect_flow_keys(skb, &keys, flag);
-				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
-				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
-				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
-				hash_keys.ports.src = keys.ports.src;
-				hash_keys.ports.dst = keys.ports.dst;
-				hash_keys.basic.ip_proto = keys.basic.ip_proto;
+				flkeys = &keys;
 			}
+
+			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
+			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
+			hash_keys.ports.src = flkeys->ports.src;
+			hash_keys.ports.dst = flkeys->ports.dst;
+			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
 		} else {
 			memset(&hash_keys, 0, sizeof(hash_keys));
 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
-- 
cgit v1.2.3


From 9a2a537acc75dfd19c4358bc9cb6042bdc60698c Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 2 Mar 2018 08:32:15 -0800
Subject: net/ipv6: Make rt6_multipath_hash similar to fib_multipath_hash

Make rt6_multipath_hash more of a direct parallel to fib_multipath_hash
and reduce stack and overhead in the process: get_hash_from_flowi6 is
just a wrapper around __get_hash_from_flowi6 with another stack
allocation for flow_keys. Move setting the addresses, protocol and
label into rt6_multipath_hash and allow it to make the call to
flow_hash_from_keys.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 190d9690dfe0..5c89af2c54f4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1833,15 +1833,21 @@ u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb,
 		       struct flow_keys *flkeys)
 {
 	struct flow_keys hash_keys;
+	u32 mhash;
 
+	memset(&hash_keys, 0, sizeof(hash_keys));
+	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
 	if (skb) {
-		memset(&hash_keys, 0, sizeof(hash_keys));
-		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
 		ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
-		return flow_hash_from_keys(&hash_keys) >> 1;
+	} else {
+		hash_keys.addrs.v6addrs.src = fl6->saddr;
+		hash_keys.addrs.v6addrs.dst = fl6->daddr;
+		hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
+		hash_keys.basic.ip_proto = fl6->flowi6_proto;
 	}
+	mhash = flow_hash_from_keys(&hash_keys);
 
-	return get_hash_from_flowi6(fl6) >> 1;
+	return mhash >> 1;
 }
 
 void ip6_route_input(struct sk_buff *skb)
-- 
cgit v1.2.3


From 3192dac64c73d8c0eb4274a3da23d829fb5177af Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 2 Mar 2018 08:32:16 -0800
Subject: net: Rename NETEVENT_MULTIPATH_HASH_UPDATE

Rename NETEVENT_MULTIPATH_HASH_UPDATE to
NETEVENT_IPV4_MPATH_HASH_UPDATE to denote it relates to a change
in the IPv4 hash policy.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/sysctl_net_ipv4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 89683d868b37..011de9a20ec6 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -400,7 +400,7 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
 
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (write && ret == 0)
-		call_netevent_notifiers(NETEVENT_MULTIPATH_HASH_UPDATE, net);
+		call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net);
 
 	return ret;
 }
-- 
cgit v1.2.3


From b75cc8f90f07342467b3bd51dbc0054f185032c9 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 2 Mar 2018 08:32:17 -0800
Subject: net/ipv6: Pass skb to route lookup

IPv6 does path selection for multipath routes deep in the lookup
functions. The next patch adds L4 hash option and needs the skb
for the forward path. To get the skb to the relevant FIB lookup
functions it needs to go through the fib rules layer, so add a
lookup_data argument to the fib_lookup_arg struct.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/anycast.c                 |  2 +-
 net/ipv6/fib6_rules.c              |  8 +++--
 net/ipv6/icmp.c                    |  3 +-
 net/ipv6/ip6_fib.c                 |  3 +-
 net/ipv6/ip6_gre.c                 |  2 +-
 net/ipv6/ip6_tunnel.c              |  4 +--
 net/ipv6/ip6_vti.c                 |  2 +-
 net/ipv6/mcast.c                   |  4 +--
 net/ipv6/netfilter/ip6t_rpfilter.c |  2 +-
 net/ipv6/netfilter/nft_fib_ipv6.c  |  3 +-
 net/ipv6/route.c                   | 72 +++++++++++++++++++++++---------------
 net/ipv6/seg6_local.c              |  4 +--
 12 files changed, 65 insertions(+), 44 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index d7d0abc7fd0e..c61718dba2e6 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -78,7 +78,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 	if (ifindex == 0) {
 		struct rt6_info *rt;
 
-		rt = rt6_lookup(net, addr, NULL, 0, 0);
+		rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);
 		if (rt) {
 			dev = rt->dst.dev;
 			ip6_rt_put(rt);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 04e5f523e50f..00ef9467f3c0 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -61,11 +61,13 @@ unsigned int fib6_rules_seq_read(struct net *net)
 }
 
 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
+				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup)
 {
 	if (net->ipv6.fib6_has_custom_rules) {
 		struct fib_lookup_arg arg = {
 			.lookup_ptr = lookup,
+			.lookup_data = skb,
 			.flags = FIB_LOOKUP_NOREF,
 		};
 
@@ -80,11 +82,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
 	} else {
 		struct rt6_info *rt;
 
-		rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, flags);
+		rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags);
 		if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN)
 			return &rt->dst;
 		ip6_rt_put(rt);
-		rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
+		rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
 		if (rt->dst.error != -EAGAIN)
 			return &rt->dst;
 		ip6_rt_put(rt);
@@ -130,7 +132,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
 		goto out;
 	}
 
-	rt = lookup(net, table, flp6, flags);
+	rt = lookup(net, table, flp6, arg->lookup_data, flags);
 	if (rt != net->ipv6.ip6_null_entry) {
 		struct fib6_rule *r = (struct fib6_rule *)rule;
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index b0778d323b6e..a5d929223820 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -629,7 +629,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
 	skb_pull(skb2, nhs);
 	skb_reset_network_header(skb2);
 
-	rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, 0);
+	rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0,
+			skb, 0);
 
 	if (rt && rt->dst.dev)
 		skb2->dev = rt->dst.dev;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index cab95cf3b39f..2f995e9e3050 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -299,11 +299,12 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id)
 }
 
 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
+				   const struct sk_buff *skb,
 				   int flags, pol_lookup_t lookup)
 {
 	struct rt6_info *rt;
 
-	rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
+	rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
 	if (rt->dst.error == -EAGAIN) {
 		ip6_rt_put(rt);
 		rt = net->ipv6.ip6_null_entry;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 4f150a394387..83c7766c8c75 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1053,7 +1053,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu)
 
 		struct rt6_info *rt = rt6_lookup(t->net,
 						 &p->raddr, &p->laddr,
-						 p->link, strict);
+						 p->link, NULL, strict);
 
 		if (!rt)
 			return;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 869e2e6750f7..1124f310df5a 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -679,7 +679,7 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 
 		/* Try to guess incoming interface */
 		rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr,
-				NULL, 0, 0);
+				NULL, 0, skb2, 0);
 
 		if (rt && rt->dst.dev)
 			skb2->dev = rt->dst.dev;
@@ -1444,7 +1444,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
 
 		struct rt6_info *rt = rt6_lookup(t->net,
 						 &p->raddr, &p->laddr,
-						 p->link, strict);
+						 p->link, NULL, strict);
 
 		if (!rt)
 			return;
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index c617ea17faa8..a482b854eeea 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -645,7 +645,7 @@ static void vti6_link_config(struct ip6_tnl *t)
 			      (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
 		struct rt6_info *rt = rt6_lookup(t->net,
 						 &p->raddr, &p->laddr,
-						 p->link, strict);
+						 p->link, NULL, strict);
 
 		if (rt)
 			tdev = rt->dst.dev;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index d9bb933dd5c4..d1a0cefac273 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -165,7 +165,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 
 	if (ifindex == 0) {
 		struct rt6_info *rt;
-		rt = rt6_lookup(net, addr, NULL, 0, 0);
+		rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);
 		if (rt) {
 			dev = rt->dst.dev;
 			ip6_rt_put(rt);
@@ -254,7 +254,7 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net,
 	struct inet6_dev *idev = NULL;
 
 	if (ifindex == 0) {
-		struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, 0);
+		struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, NULL, 0);
 
 		if (rt) {
 			dev = rt->dst.dev;
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index 94deb69bbbda..910a27318f58 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -53,7 +53,7 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
 		lookup_flags |= RT6_LOOKUP_F_IFACE;
 	}
 
-	rt = (void *) ip6_route_lookup(net, &fl6, lookup_flags);
+	rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags);
 	if (rt->dst.error)
 		goto out;
 
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
index cc5174c7254c..3230b3d7b11b 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -181,7 +181,8 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
 
 	*dest = 0;
  again:
-	rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, lookup_flags);
+	rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb,
+				      lookup_flags);
 	if (rt->dst.error)
 		goto put_rt_err;
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 5c89af2c54f4..d2b8368663cb 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -452,6 +452,7 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 
 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 					     struct flowi6 *fl6, int oif,
+					     const struct sk_buff *skb,
 					     int strict)
 {
 	struct rt6_info *sibling, *next_sibling;
@@ -460,7 +461,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 	 * case it will always be non-zero. Otherwise now is the time to do it.
 	 */
 	if (!fl6->mp_hash)
-		fl6->mp_hash = rt6_multipath_hash(fl6, NULL, NULL);
+		fl6->mp_hash = rt6_multipath_hash(fl6, skb, NULL);
 
 	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
 		return match;
@@ -914,7 +915,9 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
 
 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 					     struct fib6_table *table,
-					     struct flowi6 *fl6, int flags)
+					     struct flowi6 *fl6,
+					     const struct sk_buff *skb,
+					     int flags)
 {
 	struct rt6_info *rt, *rt_cache;
 	struct fib6_node *fn;
@@ -929,8 +932,8 @@ restart:
 		rt = rt6_device_match(net, rt, &fl6->saddr,
 				      fl6->flowi6_oif, flags);
 		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
-			rt = rt6_multipath_select(rt, fl6,
-						  fl6->flowi6_oif, flags);
+			rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif,
+						  skb, flags);
 	}
 	if (rt == net->ipv6.ip6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
@@ -954,14 +957,15 @@ restart:
 }
 
 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
-				    int flags)
+				   const struct sk_buff *skb, int flags)
 {
-	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
+	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
 }
 EXPORT_SYMBOL_GPL(ip6_route_lookup);
 
 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
-			    const struct in6_addr *saddr, int oif, int strict)
+			    const struct in6_addr *saddr, int oif,
+			    const struct sk_buff *skb, int strict)
 {
 	struct flowi6 fl6 = {
 		.flowi6_oif = oif,
@@ -975,7 +979,7 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
 		flags |= RT6_LOOKUP_F_HAS_SADDR;
 	}
 
-	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
+	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
 	if (dst->error == 0)
 		return (struct rt6_info *) dst;
 
@@ -1647,7 +1651,8 @@ void rt6_age_exceptions(struct rt6_info *rt,
 }
 
 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
-			       int oif, struct flowi6 *fl6, int flags)
+			       int oif, struct flowi6 *fl6,
+			       const struct sk_buff *skb, int flags)
 {
 	struct fib6_node *fn, *saved_fn;
 	struct rt6_info *rt, *rt_cache;
@@ -1669,7 +1674,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 redo_rt6_select:
 	rt = rt6_select(net, fn, oif, strict);
 	if (rt->rt6i_nsiblings)
-		rt = rt6_multipath_select(rt, fl6, oif, strict);
+		rt = rt6_multipath_select(rt, fl6, oif, skb, strict);
 	if (rt == net->ipv6.ip6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
@@ -1768,20 +1773,25 @@ uncached_rt_out:
 }
 EXPORT_SYMBOL_GPL(ip6_pol_route);
 
-static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
-					    struct flowi6 *fl6, int flags)
+static struct rt6_info *ip6_pol_route_input(struct net *net,
+					    struct fib6_table *table,
+					    struct flowi6 *fl6,
+					    const struct sk_buff *skb,
+					    int flags)
 {
-	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
+	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
 }
 
 struct dst_entry *ip6_route_input_lookup(struct net *net,
 					 struct net_device *dev,
-					 struct flowi6 *fl6, int flags)
+					 struct flowi6 *fl6,
+					 const struct sk_buff *skb,
+					 int flags)
 {
 	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
 		flags |= RT6_LOOKUP_F_IFACE;
 
-	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
+	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
 }
 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
 
@@ -1876,13 +1886,17 @@ void ip6_route_input(struct sk_buff *skb)
 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
 		fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys);
 	skb_dst_drop(skb);
-	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
+	skb_dst_set(skb,
+		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
 }
 
-static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
-					     struct flowi6 *fl6, int flags)
+static struct rt6_info *ip6_pol_route_output(struct net *net,
+					     struct fib6_table *table,
+					     struct flowi6 *fl6,
+					     const struct sk_buff *skb,
+					     int flags)
 {
-	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
+	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
 }
 
 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
@@ -1910,7 +1924,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
 	else if (sk)
 		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
 
-	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
+	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
 }
 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
 
@@ -2159,6 +2173,7 @@ struct ip6rd_flowi {
 static struct rt6_info *__ip6_route_redirect(struct net *net,
 					     struct fib6_table *table,
 					     struct flowi6 *fl6,
+					     const struct sk_buff *skb,
 					     int flags)
 {
 	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
@@ -2232,8 +2247,9 @@ out:
 };
 
 static struct dst_entry *ip6_route_redirect(struct net *net,
-					const struct flowi6 *fl6,
-					const struct in6_addr *gateway)
+					    const struct flowi6 *fl6,
+					    const struct sk_buff *skb,
+					    const struct in6_addr *gateway)
 {
 	int flags = RT6_LOOKUP_F_HAS_SADDR;
 	struct ip6rd_flowi rdfl;
@@ -2241,7 +2257,7 @@ static struct dst_entry *ip6_route_redirect(struct net *net,
 	rdfl.fl6 = *fl6;
 	rdfl.gateway = *gateway;
 
-	return fib6_rule_lookup(net, &rdfl.fl6,
+	return fib6_rule_lookup(net, &rdfl.fl6, skb,
 				flags, __ip6_route_redirect);
 }
 
@@ -2261,7 +2277,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
 	fl6.flowlabel = ip6_flowinfo(iph);
 	fl6.flowi6_uid = uid;
 
-	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
+	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
 	rt6_do_redirect(dst, NULL, skb);
 	dst_release(dst);
 }
@@ -2283,7 +2299,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
 	fl6.saddr = iph->daddr;
 	fl6.flowi6_uid = sock_net_uid(net, NULL);
 
-	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
+	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
 	rt6_do_redirect(dst, NULL, skb);
 	dst_release(dst);
 }
@@ -2485,7 +2501,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
 		flags |= RT6_LOOKUP_F_HAS_SADDR;
 
 	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
-	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
+	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
 
 	/* if table lookup failed, fall back to full lookup */
 	if (rt == net->ipv6.ip6_null_entry) {
@@ -2548,7 +2564,7 @@ static int ip6_route_check_nh(struct net *net,
 	}
 
 	if (!grt)
-		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
+		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
 
 	if (!grt)
 		goto out;
@@ -4613,7 +4629,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		if (!ipv6_addr_any(&fl6.saddr))
 			flags |= RT6_LOOKUP_F_HAS_SADDR;
 
-		dst = ip6_route_input_lookup(net, dev, &fl6, flags);
+		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
 
 		rcu_read_unlock();
 	} else {
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index ba3767ef5e93..45722327375a 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -161,7 +161,7 @@ static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
 		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
 
 	if (!tbl_id) {
-		dst = ip6_route_input_lookup(net, skb->dev, &fl6, flags);
+		dst = ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags);
 	} else {
 		struct fib6_table *table;
 
@@ -169,7 +169,7 @@ static void lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
 		if (!table)
 			goto out;
 
-		rt = ip6_pol_route(net, table, 0, &fl6, flags);
+		rt = ip6_pol_route(net, table, 0, &fl6, skb, flags);
 		dst = &rt->dst;
 	}
 
-- 
cgit v1.2.3


From b4bac172e90ce4a93df8adf44eb70d91b9d611eb Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 2 Mar 2018 08:32:18 -0800
Subject: net/ipv6: Add support for path selection using hash of 5-tuple

Some operators prefer IPv6 path selection to use a standard 5-tuple
hash rather than just an L3 hash with the flow the label. To that end
add support to IPv6 for multipath hash policy similar to bf4e0a3db97eb
("net: ipv4: add support for ECMP hash policy choice"). The default
is still L3 which covers source and destination addresses along with
flow label and IPv6 protocol.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Tested-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/icmp.c            |  2 +-
 net/ipv6/route.c           | 68 +++++++++++++++++++++++++++++++++++-----------
 net/ipv6/sysctl_net_ipv6.c | 27 ++++++++++++++++++
 3 files changed, 80 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index a5d929223820..6f84668be6ea 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	fl6.fl6_icmp_type = type;
 	fl6.fl6_icmp_code = code;
 	fl6.flowi6_uid = sock_net_uid(net, NULL);
-	fl6.mp_hash = rt6_multipath_hash(&fl6, skb, NULL);
+	fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL);
 	security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
 
 	sk = icmpv6_xmit_lock(net);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d2b8368663cb..f0ae58424c45 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -450,7 +450,8 @@ static bool rt6_check_expired(const struct rt6_info *rt)
 	return false;
 }
 
-static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
+static struct rt6_info *rt6_multipath_select(const struct net *net,
+					     struct rt6_info *match,
 					     struct flowi6 *fl6, int oif,
 					     const struct sk_buff *skb,
 					     int strict)
@@ -461,7 +462,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 	 * case it will always be non-zero. Otherwise now is the time to do it.
 	 */
 	if (!fl6->mp_hash)
-		fl6->mp_hash = rt6_multipath_hash(fl6, skb, NULL);
+		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
 
 	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
 		return match;
@@ -932,7 +933,7 @@ restart:
 		rt = rt6_device_match(net, rt, &fl6->saddr,
 				      fl6->flowi6_oif, flags);
 		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
-			rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif,
+			rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
 						  skb, flags);
 	}
 	if (rt == net->ipv6.ip6_null_entry) {
@@ -1674,7 +1675,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 redo_rt6_select:
 	rt = rt6_select(net, fn, oif, strict);
 	if (rt->rt6i_nsiblings)
-		rt = rt6_multipath_select(rt, fl6, oif, skb, strict);
+		rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
 	if (rt == net->ipv6.ip6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
@@ -1839,21 +1840,56 @@ out:
 }
 
 /* if skb is set it will be used and fl6 can be NULL */
-u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb,
-		       struct flow_keys *flkeys)
+u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
+		       const struct sk_buff *skb, struct flow_keys *flkeys)
 {
 	struct flow_keys hash_keys;
 	u32 mhash;
 
-	memset(&hash_keys, 0, sizeof(hash_keys));
-	hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
-	if (skb) {
-		ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
-	} else {
-		hash_keys.addrs.v6addrs.src = fl6->saddr;
-		hash_keys.addrs.v6addrs.dst = fl6->daddr;
-		hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
-		hash_keys.basic.ip_proto = fl6->flowi6_proto;
+	switch (net->ipv6.sysctl.multipath_hash_policy) {
+	case 0:
+		memset(&hash_keys, 0, sizeof(hash_keys));
+		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+		if (skb) {
+			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
+		} else {
+			hash_keys.addrs.v6addrs.src = fl6->saddr;
+			hash_keys.addrs.v6addrs.dst = fl6->daddr;
+			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
+			hash_keys.basic.ip_proto = fl6->flowi6_proto;
+		}
+		break;
+	case 1:
+		if (skb) {
+			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+			struct flow_keys keys;
+
+			/* short-circuit if we already have L4 hash present */
+			if (skb->l4_hash)
+				return skb_get_hash_raw(skb) >> 1;
+
+			memset(&hash_keys, 0, sizeof(hash_keys));
+
+                        if (!flkeys) {
+				skb_flow_dissect_flow_keys(skb, &keys, flag);
+				flkeys = &keys;
+			}
+			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
+			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
+			hash_keys.ports.src = flkeys->ports.src;
+			hash_keys.ports.dst = flkeys->ports.dst;
+			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
+		} else {
+			memset(&hash_keys, 0, sizeof(hash_keys));
+			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+			hash_keys.addrs.v6addrs.src = fl6->saddr;
+			hash_keys.addrs.v6addrs.dst = fl6->daddr;
+			hash_keys.ports.src = fl6->fl6_sport;
+			hash_keys.ports.dst = fl6->fl6_dport;
+			hash_keys.basic.ip_proto = fl6->flowi6_proto;
+		}
+		break;
 	}
 	mhash = flow_hash_from_keys(&hash_keys);
 
@@ -1884,7 +1920,7 @@ void ip6_route_input(struct sk_buff *skb)
 		flkeys = &_flkeys;
 
 	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
-		fl6.mp_hash = rt6_multipath_hash(&fl6, skb, flkeys);
+		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
 	skb_dst_drop(skb);
 	skb_dst_set(skb,
 		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 262f791f1b9b..966c42af92f4 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -16,14 +16,31 @@
 #include <net/ipv6.h>
 #include <net/addrconf.h>
 #include <net/inet_frag.h>
+#include <net/netevent.h>
 #ifdef CONFIG_NETLABEL
 #include <net/calipso.h>
 #endif
 
+static int zero;
 static int one = 1;
 static int auto_flowlabels_min;
 static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
 
+static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write,
+					  void __user *buffer, size_t *lenp,
+					  loff_t *ppos)
+{
+	struct net *net;
+	int ret;
+
+	net = container_of(table->data, struct net,
+			   ipv6.sysctl.multipath_hash_policy);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (write && ret == 0)
+		call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net);
+
+	return ret;
+}
 
 static struct ctl_table ipv6_table_template[] = {
 	{
@@ -126,6 +143,15 @@ static struct ctl_table ipv6_table_template[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "fib_multipath_hash_policy",
+		.data		= &init_net.ipv6.sysctl.multipath_hash_policy,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler   = proc_rt6_multipath_hash_policy,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 	{ }
 };
 
@@ -190,6 +216,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
 	ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt;
 	ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len;
 	ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len;
+	ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy,
 
 	ipv6_route_table = ipv6_route_sysctl_init(net);
 	if (!ipv6_route_table)
-- 
cgit v1.2.3


From de7a0f871fabe74fff7481caf7d3efe03b58fe58 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Fri, 2 Mar 2018 08:32:20 -0800
Subject: net: Remove unused get_hash_from_flow functions

__get_hash_from_flowi6 is still used for flowlabels, but the IPv4
variant and the wrappers to both are not used. Remove them.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/flow_dissector.c | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'net')

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 559db9ea8d86..d29f09bc5ff9 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1341,22 +1341,6 @@ __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
 }
 EXPORT_SYMBOL(__get_hash_from_flowi6);
 
-__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys)
-{
-	memset(keys, 0, sizeof(*keys));
-
-	keys->addrs.v4addrs.src = fl4->saddr;
-	keys->addrs.v4addrs.dst = fl4->daddr;
-	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
-	keys->ports.src = fl4->fl4_sport;
-	keys->ports.dst = fl4->fl4_dport;
-	keys->keyid.keyid = fl4->fl4_gre_key;
-	keys->basic.ip_proto = fl4->flowi4_proto;
-
-	return flow_hash_from_keys(keys);
-}
-EXPORT_SYMBOL(__get_hash_from_flowi4);
-
 static const struct flow_dissector_key flow_keys_dissector_keys[] = {
 	{
 		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
-- 
cgit v1.2.3


From 88c060549a4c555d59965801d1e811b71614c2b7 Mon Sep 17 00:00:00 2001
From: Andrew Lunn <andrew@lunn.ch>
Date: Thu, 1 Mar 2018 02:02:27 +0100
Subject: dsa: Pass the port to get_sset_count()

By passing the port, we allow different ports to have different
statistics. This is useful since some ports have SERDES interfaces
with their own statistic counters.

Signed-off-by: Andrew Lunn <andrew@lunn.ch>
Tested-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Vivien Didelot <vivien.didelot@savoirfairelinux.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dsa/master.c | 4 ++--
 net/dsa/slave.c  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/dsa/master.c b/net/dsa/master.c
index 00589147f042..90e6df0351eb 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -42,7 +42,7 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset)
 		count += ops->get_sset_count(dev, sset);
 
 	if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
-		count += ds->ops->get_sset_count(ds);
+		count += ds->ops->get_sset_count(ds, cpu_dp->index);
 
 	return count;
 }
@@ -76,7 +76,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
 		 * constructed earlier
 		 */
 		ds->ops->get_strings(ds, port, ndata);
-		count = ds->ops->get_sset_count(ds);
+		count = ds->ops->get_sset_count(ds, port);
 		for (i = 0; i < count; i++) {
 			memmove(ndata + (i * len + sizeof(pfx)),
 				ndata + i * len, len - sizeof(pfx));
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 3376dad6dcfd..18561af7a8f1 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -605,7 +605,7 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
 
 		count = 4;
 		if (ds->ops->get_sset_count)
-			count += ds->ops->get_sset_count(ds);
+			count += ds->ops->get_sset_count(ds, dp->index);
 
 		return count;
 	}
-- 
cgit v1.2.3


From 6b2536039f653e35c96e7461a7456d2246331462 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@c0d3.blue>
Date: Sun, 4 Mar 2018 21:02:18 +0100
Subject: batman-adv: Avoid redundant multicast TT entries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If a node signals that it wants all traffic for a specific protocol
family then there is no need to announce individual multicast addresses
via TT.

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/multicast.c | 56 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 6eaffe50335a..c7a1305ca7e7 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -101,8 +101,37 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface)
 	return upper;
 }
 
+/**
+ * batadv_mcast_addr_is_ipv4() - check if multicast MAC is IPv4
+ * @addr: the MAC address to check
+ *
+ * Return: True, if MAC address is one reserved for IPv4 multicast, false
+ * otherwise.
+ */
+static bool batadv_mcast_addr_is_ipv4(const u8 *addr)
+{
+	static const u8 prefix[] = {0x01, 0x00, 0x5E};
+
+	return memcmp(prefix, addr, sizeof(prefix)) == 0;
+}
+
+/**
+ * batadv_mcast_addr_is_ipv6() - check if multicast MAC is IPv6
+ * @addr: the MAC address to check
+ *
+ * Return: True, if MAC address is one reserved for IPv6 multicast, false
+ * otherwise.
+ */
+static bool batadv_mcast_addr_is_ipv6(const u8 *addr)
+{
+	static const u8 prefix[] = {0x33, 0x33};
+
+	return memcmp(prefix, addr, sizeof(prefix)) == 0;
+}
+
 /**
  * batadv_mcast_mla_softif_get() - get softif multicast listeners
+ * @bat_priv: the bat priv with all the soft interface information
  * @dev: the device to collect multicast addresses from
  * @mcast_list: a list to put found addresses into
  *
@@ -119,9 +148,12 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface)
  * Return: -ENOMEM on memory allocation error or the number of
  * items added to the mcast_list otherwise.
  */
-static int batadv_mcast_mla_softif_get(struct net_device *dev,
+static int batadv_mcast_mla_softif_get(struct batadv_priv *bat_priv,
+				       struct net_device *dev,
 				       struct hlist_head *mcast_list)
 {
+	bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4;
+	bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6;
 	struct net_device *bridge = batadv_mcast_get_bridge(dev);
 	struct netdev_hw_addr *mc_list_entry;
 	struct batadv_hw_addr *new;
@@ -129,6 +161,12 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev,
 
 	netif_addr_lock_bh(bridge ? bridge : dev);
 	netdev_for_each_mc_addr(mc_list_entry, bridge ? bridge : dev) {
+		if (all_ipv4 && batadv_mcast_addr_is_ipv4(mc_list_entry->addr))
+			continue;
+
+		if (all_ipv6 && batadv_mcast_addr_is_ipv6(mc_list_entry->addr))
+			continue;
+
 		new = kmalloc(sizeof(*new), GFP_ATOMIC);
 		if (!new) {
 			ret = -ENOMEM;
@@ -193,6 +231,7 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src)
 
 /**
  * batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners
+ * @bat_priv: the bat priv with all the soft interface information
  * @dev: a bridge slave whose bridge to collect multicast addresses from
  * @mcast_list: a list to put found addresses into
  *
@@ -204,10 +243,13 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src)
  * Return: -ENOMEM on memory allocation error or the number of
  * items added to the mcast_list otherwise.
  */
-static int batadv_mcast_mla_bridge_get(struct net_device *dev,
+static int batadv_mcast_mla_bridge_get(struct batadv_priv *bat_priv,
+				       struct net_device *dev,
 				       struct hlist_head *mcast_list)
 {
 	struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list);
+	bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4;
+	bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6;
 	struct br_ip_list *br_ip_entry, *tmp;
 	struct batadv_hw_addr *new;
 	u8 mcast_addr[ETH_ALEN];
@@ -221,6 +263,12 @@ static int batadv_mcast_mla_bridge_get(struct net_device *dev,
 		goto out;
 
 	list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) {
+		if (all_ipv4 && br_ip_entry->addr.proto == htons(ETH_P_IP))
+			continue;
+
+		if (all_ipv6 && br_ip_entry->addr.proto == htons(ETH_P_IPV6))
+			continue;
+
 		batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr);
 		if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list))
 			continue;
@@ -568,11 +616,11 @@ static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv)
 	if (!batadv_mcast_mla_tvlv_update(bat_priv))
 		goto update;
 
-	ret = batadv_mcast_mla_softif_get(soft_iface, &mcast_list);
+	ret = batadv_mcast_mla_softif_get(bat_priv, soft_iface, &mcast_list);
 	if (ret < 0)
 		goto out;
 
-	ret = batadv_mcast_mla_bridge_get(soft_iface, &mcast_list);
+	ret = batadv_mcast_mla_bridge_get(bat_priv, soft_iface, &mcast_list);
 	if (ret < 0)
 		goto out;
 
-- 
cgit v1.2.3


From 77a5196a804e34ce5e215ef84d5e1de332e9c529 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Thu, 1 Mar 2018 13:49:57 -0800
Subject: gre: add sequence number for collect md mode.

Currently GRE sequence number can only be used in native
tunnel mode.  This patch adds sequence number support for
gre collect metadata mode.  RFC2890 defines GRE sequence
number to be specific to the traffic flow identified by the
key.  However, this patch does not implement per-key seqno.
The sequence number is shared in the same tunnel device.
That is, different tunnel keys using the same collect_md
tunnel share single sequence number.

Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c  |  4 +++-
 net/ipv4/ip_gre.c  |  7 +++++--
 net/ipv6/ip6_gre.c | 13 ++++++++-----
 3 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 0c121adbdbaa..33edfa8372fd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2991,7 +2991,7 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 	struct ip_tunnel_info *info;
 
 	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
-			       BPF_F_DONT_FRAGMENT)))
+			       BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER)))
 		return -EINVAL;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
@@ -3025,6 +3025,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
 	if (flags & BPF_F_ZERO_CSUM_TX)
 		info->key.tun_flags &= ~TUNNEL_CSUM;
+	if (flags & BPF_F_SEQ_NUMBER)
+		info->key.tun_flags |= TUNNEL_SEQ;
 
 	info->key.tun_id = cpu_to_be64(from->tunnel_id);
 	info->key.tos = from->tunnel_tos;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 0fe1d69b5df4..95fd225f402e 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -522,6 +522,7 @@ err_free_skb:
 static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 			__be16 proto)
 {
+	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct ip_tunnel_info *tun_info;
 	const struct ip_tunnel_key *key;
 	struct rtable *rt = NULL;
@@ -545,9 +546,11 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 	if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
 		goto err_free_rt;
 
-	flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
+	flags = tun_info->key.tun_flags &
+		(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
 	gre_build_header(skb, tunnel_hlen, flags, proto,
-			 tunnel_id_to_key32(tun_info->key.tun_id), 0);
+			 tunnel_id_to_key32(tun_info->key.tun_id),
+			 (flags | TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);
 
 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
 
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 83c7766c8c75..18a3dfbd0300 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -695,9 +695,6 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 	else
 		fl6->daddr = tunnel->parms.raddr;
 
-	if (tunnel->parms.o_flags & TUNNEL_SEQ)
-		tunnel->o_seqno++;
-
 	/* Push GRE header. */
 	protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto;
 
@@ -720,14 +717,20 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 		fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
 
 		dsfield = key->tos;
-		flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
+		flags = key->tun_flags &
+			(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
 		tunnel->tun_hlen = gre_calc_hlen(flags);
 
 		gre_build_header(skb, tunnel->tun_hlen,
 				 flags, protocol,
-				 tunnel_id_to_key32(tun_info->key.tun_id), 0);
+				 tunnel_id_to_key32(tun_info->key.tun_id),
+				 (flags | TUNNEL_SEQ) ? htonl(tunnel->o_seqno++)
+						      : 0);
 
 	} else {
+		if (tunnel->parms.o_flags & TUNNEL_SEQ)
+			tunnel->o_seqno++;
+
 		gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
 				 protocol, tunnel->parms.o_key,
 				 htonl(tunnel->o_seqno));
-- 
cgit v1.2.3


From d143b9e3055358e160d8d5975d889ce5c149625a Mon Sep 17 00:00:00 2001
From: Roman Mashak <mrv@mojatatu.com>
Date: Fri, 2 Mar 2018 20:52:01 -0500
Subject: net sched actions: corrected extack message

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 1f65d6ada9ff..a54fa7b8c217 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1083,7 +1083,7 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
 
 	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags,
 			 RTM_NEWACTION, 0, 0) <= 0) {
-		NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while deleting TC action");
+		NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
 		kfree_skb(skb);
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From e6c6a92905210484a84a0cae5b013570b7a67b6f Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Sun, 4 Mar 2018 14:12:04 +0200
Subject: net: Make RX-FCS and LRO mutually exclusive

LRO and RX-FCS offloads cannot be enabled at the same time since it is
not clear what should happen to the FCS of each coalesced packet.
The FCS is not really part of the TCP payload, hence cannot be merged
into one big packet. On the other hand, providing one big LRO packet
with one FCS contradicts the RX-FCS feature goal.

Use the fix features mechanism in order to prevent intersection of the
features and drop LRO in case RX-FCS is requested.

Enabling RX-FCS while LRO is enabled will result in:
$ ethtool -K ens6 rx-fcs on
Actual changes:
large-receive-offload: off [requested on]
rx-fcs: on

Signed-off-by: Gal Pressman <galp@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 40fb3aed5df2..8b51f923ce99 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7542,6 +7542,12 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 		}
 	}
 
+	/* LRO feature cannot be combined with RX-FCS */
+	if ((features & NETIF_F_LRO) && (features & NETIF_F_RXFCS)) {
+		netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
+		features &= ~NETIF_F_LRO;
+	}
+
 	return features;
 }
 
-- 
cgit v1.2.3


From 87ecc95d81d951b0984f2eb9c5c118cb68d0dce8 Mon Sep 17 00:00:00 2001
From: Priyaranjan Jha <priyarjha@google.com>
Date: Sun, 4 Mar 2018 10:38:35 -0800
Subject: tcp: add send queue size stat in SCM_TIMESTAMPING_OPT_STATS

This patch adds TCP_NLA_SENDQ_SIZE stat into SCM_TIMESTAMPING_OPT_STATS.
It reports no. of bytes present in send queue, when timestamp is
generated.

Signed-off-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a33539798bf6..162ba4227446 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3031,7 +3031,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	u32 rate;
 
 	stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
-			  3 * nla_total_size(sizeof(u32)) +
+			  4 * nla_total_size(sizeof(u32)) +
 			  2 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
 	if (!stats)
 		return NULL;
@@ -3061,6 +3061,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 
 	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
 	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
+
+	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
 	return stats;
 }
 
-- 
cgit v1.2.3


From be631892948060f44b1ceee3132be1266932071e Mon Sep 17 00:00:00 2001
From: Priyaranjan Jha <priyarjha@google.com>
Date: Sun, 4 Mar 2018 10:38:36 -0800
Subject: tcp: add ca_state stat in SCM_TIMESTAMPING_OPT_STATS

This patch adds TCP_NLA_CA_STATE stat into SCM_TIMESTAMPING_OPT_STATS.
It reports ca_state of socket, when timestamp is generated.

Signed-off-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 162ba4227446..fb350f740f69 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3032,7 +3032,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 
 	stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
 			  4 * nla_total_size(sizeof(u32)) +
-			  2 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
+			  3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
 	if (!stats)
 		return NULL;
 
@@ -3063,6 +3063,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
 
 	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
+	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
 	return stats;
 }
 
-- 
cgit v1.2.3


From 955dc68cb9b23b42999cafe6df3684309bc686c6 Mon Sep 17 00:00:00 2001
From: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Date: Mon, 5 Mar 2018 11:39:05 +1100
Subject: net/ncsi: Add generic netlink family

Add a generic netlink family for NCSI. This supports three commands;
NCSI_CMD_PKG_INFO which returns information on packages and their
associated channels, NCSI_CMD_SET_INTERFACE which allows a specific
package or package/channel combination to be set as the preferred
choice, and NCSI_CMD_CLEAR_INTERFACE which clears any preferred setting.

Signed-off-by: Samuel Mendoza-Jonas <sam@mendozajonas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/Makefile       |   2 +-
 net/ncsi/internal.h     |   3 +
 net/ncsi/ncsi-manage.c  |  30 +++-
 net/ncsi/ncsi-netlink.c | 421 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/ncsi/ncsi-netlink.h |  20 +++
 5 files changed, 471 insertions(+), 5 deletions(-)
 create mode 100644 net/ncsi/ncsi-netlink.c
 create mode 100644 net/ncsi/ncsi-netlink.h

(limited to 'net')

diff --git a/net/ncsi/Makefile b/net/ncsi/Makefile
index dd12b564f2e7..436ef68331f2 100644
--- a/net/ncsi/Makefile
+++ b/net/ncsi/Makefile
@@ -1,4 +1,4 @@
 #
 # Makefile for NCSI API
 #
-obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-rsp.o ncsi-aen.o ncsi-manage.o
+obj-$(CONFIG_NET_NCSI) += ncsi-cmd.o ncsi-rsp.o ncsi-aen.o ncsi-manage.o ncsi-netlink.o
diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h
index d30f7bd741d0..8da84312cd3b 100644
--- a/net/ncsi/internal.h
+++ b/net/ncsi/internal.h
@@ -276,6 +276,8 @@ struct ncsi_dev_priv {
 	unsigned int        package_num;     /* Number of packages         */
 	struct list_head    packages;        /* List of packages           */
 	struct ncsi_channel *hot_channel;    /* Channel was ever active    */
+	struct ncsi_package *force_package;  /* Force a specific package   */
+	struct ncsi_channel *force_channel;  /* Force a specific channel   */
 	struct ncsi_request requests[256];   /* Request table              */
 	unsigned int        request_id;      /* Last used request ID       */
 #define NCSI_REQ_START_IDX	1
@@ -318,6 +320,7 @@ extern spinlock_t ncsi_dev_lock;
 	list_for_each_entry_rcu(nc, &np->channels, node)
 
 /* Resources */
+u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index);
 int ncsi_find_filter(struct ncsi_channel *nc, int table, void *data);
 int ncsi_add_filter(struct ncsi_channel *nc, int table, void *data);
 int ncsi_remove_filter(struct ncsi_channel *nc, int table, int index);
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index c989211bbabc..c3695ba0cf94 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -12,7 +12,6 @@
 #include <linux/init.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
-#include <linux/netlink.h>
 
 #include <net/ncsi.h>
 #include <net/net_namespace.h>
@@ -23,6 +22,7 @@
 
 #include "internal.h"
 #include "ncsi-pkt.h"
+#include "ncsi-netlink.h"
 
 LIST_HEAD(ncsi_dev_list);
 DEFINE_SPINLOCK(ncsi_dev_lock);
@@ -38,7 +38,7 @@ static inline int ncsi_filter_size(int table)
 	return sizes[table];
 }
 
-static u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index)
+u32 *ncsi_get_filter(struct ncsi_channel *nc, int table, int index)
 {
 	struct ncsi_channel_filter *ncf;
 	int size;
@@ -965,20 +965,37 @@ error:
 
 static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp)
 {
-	struct ncsi_package *np;
-	struct ncsi_channel *nc, *found, *hot_nc;
+	struct ncsi_package *np, *force_package;
+	struct ncsi_channel *nc, *found, *hot_nc, *force_channel;
 	struct ncsi_channel_mode *ncm;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ndp->lock, flags);
 	hot_nc = ndp->hot_channel;
+	force_channel = ndp->force_channel;
+	force_package = ndp->force_package;
 	spin_unlock_irqrestore(&ndp->lock, flags);
 
+	/* Force a specific channel whether or not it has link if we have been
+	 * configured to do so
+	 */
+	if (force_package && force_channel) {
+		found = force_channel;
+		ncm = &found->modes[NCSI_MODE_LINK];
+		if (!(ncm->data[2] & 0x1))
+			netdev_info(ndp->ndev.dev,
+				    "NCSI: Channel %u forced, but it is link down\n",
+				    found->id);
+		goto out;
+	}
+
 	/* The search is done once an inactive channel with up
 	 * link is found.
 	 */
 	found = NULL;
 	NCSI_FOR_EACH_PACKAGE(ndp, np) {
+		if (ndp->force_package && np != ndp->force_package)
+			continue;
 		NCSI_FOR_EACH_CHANNEL(np, nc) {
 			spin_lock_irqsave(&nc->lock, flags);
 
@@ -1594,6 +1611,9 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
 	ndp->ptype.dev = dev;
 	dev_add_pack(&ndp->ptype);
 
+	/* Set up generic netlink interface */
+	ncsi_init_netlink(dev);
+
 	return nd;
 }
 EXPORT_SYMBOL_GPL(ncsi_register_dev);
@@ -1673,6 +1693,8 @@ void ncsi_unregister_dev(struct ncsi_dev *nd)
 #endif
 	spin_unlock_irqrestore(&ncsi_dev_lock, flags);
 
+	ncsi_unregister_netlink(nd->dev);
+
 	kfree(ndp);
 }
 EXPORT_SYMBOL_GPL(ncsi_unregister_dev);
diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
new file mode 100644
index 000000000000..d4201665a580
--- /dev/null
+++ b/net/ncsi/ncsi-netlink.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright Samuel Mendoza-Jonas, IBM Corporation 2018.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+#include <net/genetlink.h>
+#include <net/ncsi.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <uapi/linux/ncsi.h>
+
+#include "internal.h"
+#include "ncsi-netlink.h"
+
+static struct genl_family ncsi_genl_family;
+
+static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = {
+	[NCSI_ATTR_IFINDEX] =		{ .type = NLA_U32 },
+	[NCSI_ATTR_PACKAGE_LIST] =	{ .type = NLA_NESTED },
+	[NCSI_ATTR_PACKAGE_ID] =	{ .type = NLA_U32 },
+	[NCSI_ATTR_CHANNEL_ID] =	{ .type = NLA_U32 },
+};
+
+static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex)
+{
+	struct ncsi_dev_priv *ndp;
+	struct net_device *dev;
+	struct ncsi_dev *nd;
+	struct ncsi_dev;
+
+	if (!net)
+		return NULL;
+
+	dev = dev_get_by_index(net, ifindex);
+	if (!dev) {
+		pr_err("NCSI netlink: No device for ifindex %u\n", ifindex);
+		return NULL;
+	}
+
+	nd = ncsi_find_dev(dev);
+	ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL;
+
+	dev_put(dev);
+	return ndp;
+}
+
+static int ncsi_write_channel_info(struct sk_buff *skb,
+				   struct ncsi_dev_priv *ndp,
+				   struct ncsi_channel *nc)
+{
+	struct nlattr *vid_nest;
+	struct ncsi_channel_filter *ncf;
+	struct ncsi_channel_mode *m;
+	u32 *data;
+	int i;
+
+	nla_put_u32(skb, NCSI_CHANNEL_ATTR_ID, nc->id);
+	m = &nc->modes[NCSI_MODE_LINK];
+	nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]);
+	if (nc->state == NCSI_CHANNEL_ACTIVE)
+		nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE);
+	if (ndp->force_channel == nc)
+		nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED);
+
+	nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.version);
+	nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MINOR, nc->version.alpha2);
+	nla_put_string(skb, NCSI_CHANNEL_ATTR_VERSION_STR, nc->version.fw_name);
+
+	vid_nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR_VLAN_LIST);
+	if (!vid_nest)
+		return -ENOMEM;
+	ncf = nc->filters[NCSI_FILTER_VLAN];
+	i = -1;
+	if (ncf) {
+		while ((i = find_next_bit((void *)&ncf->bitmap, ncf->total,
+					  i + 1)) < ncf->total) {
+			data = ncsi_get_filter(nc, NCSI_FILTER_VLAN, i);
+			/* Uninitialised channels will have 'zero' vlan ids */
+			if (!data || !*data)
+				continue;
+			nla_put_u16(skb, NCSI_CHANNEL_ATTR_VLAN_ID,
+				    *(u16 *)data);
+		}
+	}
+	nla_nest_end(skb, vid_nest);
+
+	return 0;
+}
+
+static int ncsi_write_package_info(struct sk_buff *skb,
+				   struct ncsi_dev_priv *ndp, unsigned int id)
+{
+	struct nlattr *pnest, *cnest, *nest;
+	struct ncsi_package *np;
+	struct ncsi_channel *nc;
+	bool found;
+	int rc;
+
+	if (id > ndp->package_num) {
+		netdev_info(ndp->ndev.dev, "NCSI: No package with id %u\n", id);
+		return -ENODEV;
+	}
+
+	found = false;
+	NCSI_FOR_EACH_PACKAGE(ndp, np) {
+		if (np->id != id)
+			continue;
+		pnest = nla_nest_start(skb, NCSI_PKG_ATTR);
+		if (!pnest)
+			return -ENOMEM;
+		nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id);
+		if (ndp->force_package == np)
+			nla_put_flag(skb, NCSI_PKG_ATTR_FORCED);
+		cnest = nla_nest_start(skb, NCSI_PKG_ATTR_CHANNEL_LIST);
+		if (!cnest) {
+			nla_nest_cancel(skb, pnest);
+			return -ENOMEM;
+		}
+		NCSI_FOR_EACH_CHANNEL(np, nc) {
+			nest = nla_nest_start(skb, NCSI_CHANNEL_ATTR);
+			if (!nest) {
+				nla_nest_cancel(skb, cnest);
+				nla_nest_cancel(skb, pnest);
+				return -ENOMEM;
+			}
+			rc = ncsi_write_channel_info(skb, ndp, nc);
+			if (rc) {
+				nla_nest_cancel(skb, nest);
+				nla_nest_cancel(skb, cnest);
+				nla_nest_cancel(skb, pnest);
+				return rc;
+			}
+			nla_nest_end(skb, nest);
+		}
+		nla_nest_end(skb, cnest);
+		nla_nest_end(skb, pnest);
+		found = true;
+	}
+
+	if (!found)
+		return -ENODEV;
+
+	return 0;
+}
+
+static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info)
+{
+	struct ncsi_dev_priv *ndp;
+	unsigned int package_id;
+	struct sk_buff *skb;
+	struct nlattr *attr;
+	void *hdr;
+	int rc;
+
+	if (!info || !info->attrs)
+		return -EINVAL;
+
+	if (!info->attrs[NCSI_ATTR_IFINDEX])
+		return -EINVAL;
+
+	if (!info->attrs[NCSI_ATTR_PACKAGE_ID])
+		return -EINVAL;
+
+	ndp = ndp_from_ifindex(genl_info_net(info),
+			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
+	if (!ndp)
+		return -ENODEV;
+
+	skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+			  &ncsi_genl_family, 0, NCSI_CMD_PKG_INFO);
+	if (!hdr) {
+		kfree(skb);
+		return -EMSGSIZE;
+	}
+
+	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
+
+	attr = nla_nest_start(skb, NCSI_ATTR_PACKAGE_LIST);
+	rc = ncsi_write_package_info(skb, ndp, package_id);
+
+	if (rc) {
+		nla_nest_cancel(skb, attr);
+		goto err;
+	}
+
+	nla_nest_end(skb, attr);
+
+	genlmsg_end(skb, hdr);
+	return genlmsg_reply(skb, info);
+
+err:
+	genlmsg_cancel(skb, hdr);
+	kfree(skb);
+	return rc;
+}
+
+static int ncsi_pkg_info_all_nl(struct sk_buff *skb,
+				struct netlink_callback *cb)
+{
+	struct nlattr *attrs[NCSI_ATTR_MAX];
+	struct ncsi_package *np, *package;
+	struct ncsi_dev_priv *ndp;
+	unsigned int package_id;
+	struct nlattr *attr;
+	void *hdr;
+	int rc;
+
+	rc = genlmsg_parse(cb->nlh, &ncsi_genl_family, attrs, NCSI_ATTR_MAX,
+			   ncsi_genl_policy, NULL);
+	if (rc)
+		return rc;
+
+	if (!attrs[NCSI_ATTR_IFINDEX])
+		return -EINVAL;
+
+	ndp = ndp_from_ifindex(get_net(sock_net(skb->sk)),
+			       nla_get_u32(attrs[NCSI_ATTR_IFINDEX]));
+
+	if (!ndp)
+		return -ENODEV;
+
+	package_id = cb->args[0];
+	package = NULL;
+	NCSI_FOR_EACH_PACKAGE(ndp, np)
+		if (np->id == package_id)
+			package = np;
+
+	if (!package)
+		return 0; /* done */
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			  &ncsi_genl_family, 0,  NCSI_CMD_PKG_INFO);
+	if (!hdr) {
+		rc = -EMSGSIZE;
+		goto err;
+	}
+
+	attr = nla_nest_start(skb, NCSI_ATTR_PACKAGE_LIST);
+	rc = ncsi_write_package_info(skb, ndp, package->id);
+	if (rc) {
+		nla_nest_cancel(skb, attr);
+		goto err;
+	}
+
+	nla_nest_end(skb, attr);
+	genlmsg_end(skb, hdr);
+
+	cb->args[0] = package_id + 1;
+
+	return skb->len;
+err:
+	genlmsg_cancel(skb, hdr);
+	return rc;
+}
+
+static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
+{
+	struct ncsi_package *np, *package;
+	struct ncsi_channel *nc, *channel;
+	u32 package_id, channel_id;
+	struct ncsi_dev_priv *ndp;
+	unsigned long flags;
+
+	if (!info || !info->attrs)
+		return -EINVAL;
+
+	if (!info->attrs[NCSI_ATTR_IFINDEX])
+		return -EINVAL;
+
+	if (!info->attrs[NCSI_ATTR_PACKAGE_ID])
+		return -EINVAL;
+
+	ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
+			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
+	if (!ndp)
+		return -ENODEV;
+
+	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
+	package = NULL;
+
+	spin_lock_irqsave(&ndp->lock, flags);
+
+	NCSI_FOR_EACH_PACKAGE(ndp, np)
+		if (np->id == package_id)
+			package = np;
+	if (!package) {
+		/* The user has set a package that does not exist */
+		return -ERANGE;
+	}
+
+	channel = NULL;
+	if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) {
+		/* Allow any channel */
+		channel_id = NCSI_RESERVED_CHANNEL;
+	} else {
+		channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]);
+		NCSI_FOR_EACH_CHANNEL(package, nc)
+			if (nc->id == channel_id)
+				channel = nc;
+	}
+
+	if (channel_id != NCSI_RESERVED_CHANNEL && !channel) {
+		/* The user has set a channel that does not exist on this
+		 * package
+		 */
+		netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n",
+			    channel_id);
+		return -ERANGE;
+	}
+
+	ndp->force_package = package;
+	ndp->force_channel = channel;
+	spin_unlock_irqrestore(&ndp->lock, flags);
+
+	netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x%s as preferred\n",
+		    package_id, channel_id,
+		    channel_id == NCSI_RESERVED_CHANNEL ? " (any)" : "");
+
+	/* Bounce the NCSI channel to set changes */
+	ncsi_stop_dev(&ndp->ndev);
+	ncsi_start_dev(&ndp->ndev);
+
+	return 0;
+}
+
+static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info)
+{
+	struct ncsi_dev_priv *ndp;
+	unsigned long flags;
+
+	if (!info || !info->attrs)
+		return -EINVAL;
+
+	if (!info->attrs[NCSI_ATTR_IFINDEX])
+		return -EINVAL;
+
+	ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)),
+			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
+	if (!ndp)
+		return -ENODEV;
+
+	/* Clear any override */
+	spin_lock_irqsave(&ndp->lock, flags);
+	ndp->force_package = NULL;
+	ndp->force_channel = NULL;
+	spin_unlock_irqrestore(&ndp->lock, flags);
+	netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n");
+
+	/* Bounce the NCSI channel to set changes */
+	ncsi_stop_dev(&ndp->ndev);
+	ncsi_start_dev(&ndp->ndev);
+
+	return 0;
+}
+
+static const struct genl_ops ncsi_ops[] = {
+	{
+		.cmd = NCSI_CMD_PKG_INFO,
+		.policy = ncsi_genl_policy,
+		.doit = ncsi_pkg_info_nl,
+		.dumpit = ncsi_pkg_info_all_nl,
+		.flags = 0,
+	},
+	{
+		.cmd = NCSI_CMD_SET_INTERFACE,
+		.policy = ncsi_genl_policy,
+		.doit = ncsi_set_interface_nl,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = NCSI_CMD_CLEAR_INTERFACE,
+		.policy = ncsi_genl_policy,
+		.doit = ncsi_clear_interface_nl,
+		.flags = GENL_ADMIN_PERM,
+	},
+};
+
+static struct genl_family ncsi_genl_family __ro_after_init = {
+	.name = "NCSI",
+	.version = 0,
+	.maxattr = NCSI_ATTR_MAX,
+	.module = THIS_MODULE,
+	.ops = ncsi_ops,
+	.n_ops = ARRAY_SIZE(ncsi_ops),
+};
+
+int ncsi_init_netlink(struct net_device *dev)
+{
+	int rc;
+
+	rc = genl_register_family(&ncsi_genl_family);
+	if (rc)
+		netdev_err(dev, "ncsi: failed to register netlink family\n");
+
+	return rc;
+}
+
+int ncsi_unregister_netlink(struct net_device *dev)
+{
+	int rc;
+
+	rc = genl_unregister_family(&ncsi_genl_family);
+	if (rc)
+		netdev_err(dev, "ncsi: failed to unregister netlink family\n");
+
+	return rc;
+}
diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h
new file mode 100644
index 000000000000..91a5c256f8c4
--- /dev/null
+++ b/net/ncsi/ncsi-netlink.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright Samuel Mendoza-Jonas, IBM Corporation 2018.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __NCSI_NETLINK_H__
+#define __NCSI_NETLINK_H__
+
+#include <linux/netdevice.h>
+
+#include "internal.h"
+
+int ncsi_init_netlink(struct net_device *dev);
+int ncsi_unregister_netlink(struct net_device *dev);
+
+#endif /* __NCSI_NETLINK_H__ */
-- 
cgit v1.2.3


From ec012f3b85159102662264bb5c3436c849d2b1b9 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 5 Mar 2018 14:30:41 +0300
Subject: net: Convert broute_net_ops, frame_filter_net_ops and
 frame_nat_net_ops

These pernet_operations use ebt_register_table() and
ebt_unregister_table() to act on the tables, which
are used as argument in ebt_do_table(), called from
ebtables hooks.

Since there are no net-related bridge packets in-flight,
when the init and exit methods are called, these
pernet_operations are safe to be executed in parallel
with any other.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/netfilter/ebtable_broute.c | 1 +
 net/bridge/netfilter/ebtable_filter.c | 1 +
 net/bridge/netfilter/ebtable_nat.c    | 1 +
 3 files changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 276b60262981..f070b5e5b9dd 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -77,6 +77,7 @@ static void __net_exit broute_net_exit(struct net *net)
 static struct pernet_operations broute_net_ops = {
 	.init = broute_net_init,
 	.exit = broute_net_exit,
+	.async = true,
 };
 
 static int __init ebtable_broute_init(void)
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index c41da5fac84f..4151afc8efcc 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -105,6 +105,7 @@ static void __net_exit frame_filter_net_exit(struct net *net)
 static struct pernet_operations frame_filter_net_ops = {
 	.init = frame_filter_net_init,
 	.exit = frame_filter_net_exit,
+	.async = true,
 };
 
 static int __init ebtable_filter_init(void)
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 08df7406ecb3..b8da2dfe2ec5 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -105,6 +105,7 @@ static void __net_exit frame_nat_net_exit(struct net *net)
 static struct pernet_operations frame_nat_net_ops = {
 	.init = frame_nat_net_init,
 	.exit = frame_nat_net_exit,
+	.async = true,
 };
 
 static int __init ebtable_nat_init(void)
-- 
cgit v1.2.3


From 3822034569ac8ae39556e50fa165dc7d4276f452 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 5 Mar 2018 14:30:50 +0300
Subject: net: Convert log pernet_operations

These pernet_operations use nf_log_set() and nf_log_unset()
in their methods:

	nf_log_bridge_net_ops
	nf_log_arp_net_ops
	nf_log_ipv4_net_ops
	nf_log_ipv6_net_ops
	nf_log_netdev_net_ops

Nobody can send such a packet to a net before it's became
registered, nobody can send a packet after all netdevices
are unregistered. So, these pernet_operations are able
to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/netfilter/nf_log_bridge.c | 1 +
 net/ipv4/netfilter/nf_log_arp.c      | 1 +
 net/ipv4/netfilter/nf_log_ipv4.c     | 1 +
 net/ipv6/netfilter/nf_log_ipv6.c     | 1 +
 net/netfilter/nf_log_netdev.c        | 1 +
 5 files changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c
index bd2b3c78f59b..91bfc2ac055a 100644
--- a/net/bridge/netfilter/nf_log_bridge.c
+++ b/net/bridge/netfilter/nf_log_bridge.c
@@ -48,6 +48,7 @@ static void __net_exit nf_log_bridge_net_exit(struct net *net)
 static struct pernet_operations nf_log_bridge_net_ops = {
 	.init = nf_log_bridge_net_init,
 	.exit = nf_log_bridge_net_exit,
+	.async = true,
 };
 
 static int __init nf_log_bridge_init(void)
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index df5c2a2061a4..162293469ac2 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -122,6 +122,7 @@ static void __net_exit nf_log_arp_net_exit(struct net *net)
 static struct pernet_operations nf_log_arp_net_ops = {
 	.init = nf_log_arp_net_init,
 	.exit = nf_log_arp_net_exit,
+	.async = true,
 };
 
 static int __init nf_log_arp_init(void)
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 4388de0e5380..7a06de140f3c 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -358,6 +358,7 @@ static void __net_exit nf_log_ipv4_net_exit(struct net *net)
 static struct pernet_operations nf_log_ipv4_net_ops = {
 	.init = nf_log_ipv4_net_init,
 	.exit = nf_log_ipv4_net_exit,
+	.async = true,
 };
 
 static int __init nf_log_ipv4_init(void)
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index b397a8fe88b9..0220e584589c 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -390,6 +390,7 @@ static void __net_exit nf_log_ipv6_net_exit(struct net *net)
 static struct pernet_operations nf_log_ipv6_net_ops = {
 	.init = nf_log_ipv6_net_init,
 	.exit = nf_log_ipv6_net_exit,
+	.async = true,
 };
 
 static int __init nf_log_ipv6_init(void)
diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c
index 350eb147754d..254c2c6bde48 100644
--- a/net/netfilter/nf_log_netdev.c
+++ b/net/netfilter/nf_log_netdev.c
@@ -47,6 +47,7 @@ static void __net_exit nf_log_netdev_net_exit(struct net *net)
 static struct pernet_operations nf_log_netdev_net_ops = {
 	.init = nf_log_netdev_net_init,
 	.exit = nf_log_netdev_net_exit,
+	.async = true,
 };
 
 static int __init nf_log_netdev_init(void)
-- 
cgit v1.2.3


From c60a246cd366b5da5a12cc972a25e41bc536706f Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 5 Mar 2018 14:31:00 +0300
Subject: net: Convert arp_tables_net_ops and ip6_tables_net_ops

These pernet_operations call xt_proto_init() and xt_proto_fini(),
which just register and unregister /proc entries.
They are safe to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/arp_tables.c | 1 +
 net/ipv6/netfilter/ip6_tables.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e3e420f3ba7b..c36ffce3c812 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1635,6 +1635,7 @@ static void __net_exit arp_tables_net_exit(struct net *net)
 static struct pernet_operations arp_tables_net_ops = {
 	.init = arp_tables_net_init,
 	.exit = arp_tables_net_exit,
+	.async = true,
 };
 
 static int __init arp_tables_init(void)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 62358b93bbac..4de8ac1e5af4 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1928,6 +1928,7 @@ static void __net_exit ip6_tables_net_exit(struct net *net)
 static struct pernet_operations ip6_tables_net_ops = {
 	.init = ip6_tables_net_init,
 	.exit = ip6_tables_net_exit,
+	.async = true,
 };
 
 static int __init ip6_tables_init(void)
-- 
cgit v1.2.3


From d217472410a033cdef5b8fba4393b466ee3c3bbc Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 5 Mar 2018 14:31:10 +0300
Subject: net: Convert caif_net_ops

Init method just allocates memory for new cfg, and
assigns net_generic(net, caif_net_id). Despite there is
synchronize_rcu() on error path in cfcnfg_create(),
in real this function does not use global lists,
so it looks like this synchronize_rcu() is some legacy
inheritance. Exit method removes caif devices under
rtnl_lock().

There could be a problem, if someone from foreign net
pernet_operations dereference caif_net_id of this net.
It's dereferenced in get_cfcnfg() and caif_device_list().

get_cfcnfg() is used from netdevice notifiers, where
they are called under rtnl_lock(). The notifiers can't
be called from foreign nets pernet_operations. Also,
it's used from caif_disconnect_client() and from
caif_connect_client(). The both of the functions work
with caif socket, and there is the only possibility
to have a socket, when the net is dead. This may happen
only of the socket was created as kern using sk_alloc().
Grep by PF_CAIF shows we do not create kern caif sockets,
so get_cfcnfg() is safe.

caif_device_list() is used in netdevice notifiers and exit
method under rtnl lock. Also, from caif_get() used in
the netdev notifiers and in caif_flow_cb(). The last item
is skb destructor. Since there are no kernel caif sockets
nobody can send net a packet in parallel with init/exit,
so this is also safe.

So, these pernet_operations are safe to be async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/caif/caif_dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index e0adcd123f48..7a78268cc572 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -544,6 +544,7 @@ static struct pernet_operations caif_net_ops = {
 	.exit = caif_exit_net,
 	.id   = &caif_net_id,
 	.size = sizeof(struct caif_net),
+	.async = true,
 };
 
 /* Initialize Caif devices list */
-- 
cgit v1.2.3


From 111da7adc1276d44d20c3973f4b39d62f83df056 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 5 Mar 2018 14:31:19 +0300
Subject: net: Convert cangw_pernet_ops

These pernet_operations have a deal with cgw_list,
and the rest of accesses are made under rtnl_lock().
The only exception is cgw_dump_jobs(), which is
accessed under rcu_read_lock(). cgw_dump_jobs() is
called on netlink request, and it does not seem,
foreign pernet_operations want to send a net such
the messages. So, we mark them as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/can/gw.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/can/gw.c b/net/can/gw.c
index 398dd0395ad9..08e97668d5cf 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -1010,6 +1010,7 @@ static void __net_exit cangw_pernet_exit(struct net *net)
 static struct pernet_operations cangw_pernet_ops = {
 	.init = cangw_pernet_init,
 	.exit = cangw_pernet_exit,
+	.async = true,
 };
 
 static __init int cgw_module_init(void)
-- 
cgit v1.2.3


From 5368bd72cd7aacd0e3f80f61e9b2a096fc81cc10 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 5 Mar 2018 14:31:28 +0300
Subject: net: Convert dccp_v4_ops

These pernet_operations create and destroy net::dccp::v4_ctl_sk.
It looks like another pernet_operations don't want to send
dccp packets to dying or creating net. Batch method similar
to ipv4/ipv6 sockets and it has to be safe to be executed
in parallel with anything else. So, we mark them as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ipv4.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index e65fcb45c3f6..13ad28ab1e79 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -1031,6 +1031,7 @@ static struct pernet_operations dccp_v4_ops = {
 	.init	= dccp_v4_init_net,
 	.exit	= dccp_v4_exit_net,
 	.exit_batch = dccp_v4_exit_batch,
+	.async	= true,
 };
 
 static int __init dccp_v4_init(void)
-- 
cgit v1.2.3


From 16b0c0c4d9a799979b0d3f387821c93bcc301f79 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 5 Mar 2018 14:31:37 +0300
Subject: net: Convert dccp_v6_ops

These pernet_operations looks similar to dccp_v4_ops,
and they are also safe to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/dccp/ipv6.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 5df7857fc0f3..2f48c020f8c3 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1116,6 +1116,7 @@ static struct pernet_operations dccp_v6_ops = {
 	.init   = dccp_v6_init_net,
 	.exit   = dccp_v6_exit_net,
 	.exit_batch = dccp_v6_exit_batch,
+	.async	= true,
 };
 
 static int __init dccp_v6_init(void)
-- 
cgit v1.2.3


From 6c6c566e6dd80bec6f5bbb9c36e397ed9b109cdb Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 5 Mar 2018 14:31:47 +0300
Subject: net: Convert fou_net_ops

These pernet_operations initialize and destroy
pernet net_generic(net, fou_net_id) list.
The rest of net_generic(net, fou_net_id) accesses
may happen after netlink message, and in-tree
pernet_operations do not send FOU_GENL_NAME messages.
So, these pernet_operations are safe to be marked
as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fou.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 1540db65241a..d3e1a9af478b 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -1081,6 +1081,7 @@ static struct pernet_operations fou_net_ops = {
 	.exit = fou_exit_net,
 	.id   = &fou_net_id,
 	.size = sizeof(struct fou_net),
+	.async = true,
 };
 
 static int __init fou_init(void)
-- 
cgit v1.2.3


From a5a179b6dff15ae0b89cb601b0a0242cff60b8e3 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 5 Mar 2018 14:31:55 +0300
Subject: net: Convert ip_set_net_ops

These pernet_operations initialize and destroy
net_generic(net, ip_set_net_id)-related data.
Since ip_set is under CONFIG_IP_SET, it's easy
to watch drivers, which depend on this config.
All of them are in net/netfilter/ipset directory,
except of net/netfilter/xt_set.c. There are no
more drivers, which use ip_set, and all of
the above don't register another pernet_operations.
Also, there are is no indirect users, as header
file include/linux/netfilter/ipset/ip_set.h does
not define indirect users by something like this:

	#ifdef CONFIG_IP_SET
	extern func(void);
	#else
	static inline func(void);
	#endif

So, there are no more pernet operations, dereferencing
net_generic(net, ip_set_net_id).

ip_set_net_ops are OK to be executed in parallel
for several net, so we mark them as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/ipset/ip_set_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 975a85a48d39..2523ebe2b3cc 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -2094,7 +2094,8 @@ static struct pernet_operations ip_set_net_ops = {
 	.init	= ip_set_net_init,
 	.exit   = ip_set_net_exit,
 	.id	= &ip_set_net_id,
-	.size	= sizeof(struct ip_set_net)
+	.size	= sizeof(struct ip_set_net),
+	.async	= true,
 };
 
 static int __init
-- 
cgit v1.2.3


From 467d14b3073960645902ea0d18c1cbe645013638 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 5 Mar 2018 14:32:06 +0300
Subject: net: Convert nf_conntrack_net_ops

These pernet_operations register and unregister sysctl and /proc
entries. Exit batch method also waits till all per-net conntracks
are dead. Thus, they are safe to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_standalone.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 9123fdec5e14..3cdce391362e 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -705,6 +705,7 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
 static struct pernet_operations nf_conntrack_net_ops = {
 	.init		= nf_conntrack_pernet_init,
 	.exit_batch	= nf_conntrack_pernet_exit,
+	.async		= true,
 };
 
 static int __init nf_conntrack_standalone_init(void)
-- 
cgit v1.2.3


From b04a3d098c4ca176849ee579880c63c052ce6776 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 5 Mar 2018 14:32:15 +0300
Subject: net: Convert ctnetlink_net_ops

These pernet_operations register and unregister
two conntrack notifiers, and they seem to be safe
to be executed in parallel.

General/not related to async pernet_operations JFI:
ctnetlink_net_exit_batch() actions are grouped in batch,
and this could look like there is synchronize_rcu()
is forgotten. But there is synchronize_rcu() on module
exit patch (in ctnetlink_exit()), so this batch may
be reworked as simple .exit method.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_netlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index dd177ebee9aa..8884d302d33a 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -3417,6 +3417,7 @@ static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list)
 static struct pernet_operations ctnetlink_net_ops = {
 	.init		= ctnetlink_net_init,
 	.exit_batch	= ctnetlink_net_exit_batch,
+	.async		= true,
 };
 
 static int __init ctnetlink_init(void)
-- 
cgit v1.2.3


From c29babb7fe26e1507e8c236abbfed6e944e42651 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 5 Mar 2018 14:32:23 +0300
Subject: net: Convert proto_gre_net_ops

These pernet_operations register and unregister sysctl.
nf_conntrack_l4proto_gre4->init_net is simple memory
initializer. Also, exit method removes gre keymap_list,
which is per-net. This looks safe to be executed
in parallel with other pernet_operations.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_proto_gre.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index d049ea5a3770..9bcd72fe91f9 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -406,6 +406,7 @@ static struct pernet_operations proto_gre_net_ops = {
 	.exit = proto_gre_net_exit,
 	.id   = &proto_gre_net_id,
 	.size = sizeof(struct netns_proto_gre),
+	.async = true,
 };
 
 static int __init nf_ct_proto_gre_init(void)
-- 
cgit v1.2.3


From 4c1342d967cb556ea1c0f34271b125deeb25f0f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= <j.neuschaefer@gmx.net>
Date: Sun, 4 Mar 2018 03:29:52 +0100
Subject: net: core: dst_cache_set_ip6: Rename 'addr' parameter to 'saddr' for
 consistency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The other dst_cache_{get,set}_ip{4,6} functions, and the doc comment for
dst_cache_set_ip6 use 'saddr' for their source address parameter. Rename
the parameter to increase consistency.

This fixes the following kernel-doc warnings:

./include/net/dst_cache.h:58: warning: Function parameter or member 'addr' not described in 'dst_cache_set_ip6'
./include/net/dst_cache.h:58: warning: Excess function parameter 'saddr' description in 'dst_cache_set_ip6'

Fixes: 911362c70df5 ("net: add dst_cache support")
Signed-off-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dst_cache.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 554d36449231..64cef977484a 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -107,7 +107,7 @@ EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
 
 #if IS_ENABLED(CONFIG_IPV6)
 void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
-		       const struct in6_addr *addr)
+		       const struct in6_addr *saddr)
 {
 	struct dst_cache_pcpu *idst;
 
@@ -117,7 +117,7 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
 	idst = this_cpu_ptr(dst_cache->cache);
 	dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst,
 				  rt6_get_cookie((struct rt6_info *)dst));
-	idst->in6_saddr = *addr;
+	idst->in6_saddr = *saddr;
 }
 EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
 
-- 
cgit v1.2.3


From ae0662f84b105776734cb089703a7bf834bac195 Mon Sep 17 00:00:00 2001
From: kbuild test robot <fengguang.wu@intel.com>
Date: Sat, 20 Jan 2018 04:27:58 +0800
Subject: netfilter: nf_tables: nf_tables_obj_lookup_byhandle() can be static

Fixes: 3ecbfd65f50e ("netfilter: nf_tables: allocate handle and delete objects via handle")
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8b9fe30de0cd..8cc7fc970f0c 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4328,9 +4328,9 @@ struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
 }
 EXPORT_SYMBOL_GPL(nf_tables_obj_lookup);
 
-struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table,
-						 const struct nlattr *nla,
-						 u32 objtype, u8 genmask)
+static struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table,
+							const struct nlattr *nla,
+							u32 objtype, u8 genmask)
 {
 	struct nft_object *obj;
 
@@ -4850,7 +4850,7 @@ struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
 }
 EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup);
 
-struct nft_flowtable *
+static struct nft_flowtable *
 nf_tables_flowtable_lookup_byhandle(const struct nft_table *table,
 				    const struct nlattr *nla, u8 genmask)
 {
-- 
cgit v1.2.3


From cceae76ef3a1181242e4f7b559a7bfc904a9855c Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Sun, 11 Feb 2018 19:17:20 +0900
Subject: netfilter: nfnetlink_acct: remove useless parameter

parameter skb in nfnl_acct_overquota is not used anywhere.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_acct.c | 3 +--
 net/netfilter/xt_nfacct.c      | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 88d427f9f9e6..b9505bcd3827 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -467,8 +467,7 @@ static void nfnl_overquota_report(struct net *net, struct nf_acct *nfacct)
 			  GFP_ATOMIC);
 }
 
-int nfnl_acct_overquota(struct net *net, const struct sk_buff *skb,
-			struct nf_acct *nfacct)
+int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct)
 {
 	u64 now;
 	u64 *quota;
diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c
index c8674deed4eb..6b56f4170860 100644
--- a/net/netfilter/xt_nfacct.c
+++ b/net/netfilter/xt_nfacct.c
@@ -28,7 +28,7 @@ static bool nfacct_mt(const struct sk_buff *skb, struct xt_action_param *par)
 
 	nfnl_acct_update(skb, info->nfacct);
 
-	overquota = nfnl_acct_overquota(xt_net(par), skb, info->nfacct);
+	overquota = nfnl_acct_overquota(xt_net(par), info->nfacct);
 
 	return overquota == NFACCT_UNDERQUOTA ? false : true;
 }
-- 
cgit v1.2.3


From 580c7d9e4cc69802189b872ad2df0d704c649441 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Sun, 11 Feb 2018 22:57:29 +0900
Subject: netfilter: xt_cluster: get rid of xt_cluster_ipv6_is_multicast

If use the ipv6_addr_is_multicast instead of xt_cluster_ipv6_is_multicast,
then we can reduce code size.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_cluster.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c
index 0068688995c8..dfbdbb2fc0ed 100644
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -59,13 +59,6 @@ xt_cluster_hash(const struct nf_conn *ct,
 	return reciprocal_scale(hash, info->total_nodes);
 }
 
-static inline bool
-xt_cluster_ipv6_is_multicast(const struct in6_addr *addr)
-{
-	__be32 st = addr->s6_addr32[0];
-	return ((st & htonl(0xFF000000)) == htonl(0xFF000000));
-}
-
 static inline bool
 xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family)
 {
@@ -76,8 +69,7 @@ xt_cluster_is_multicast_addr(const struct sk_buff *skb, u_int8_t family)
 		is_multicast = ipv4_is_multicast(ip_hdr(skb)->daddr);
 		break;
 	case NFPROTO_IPV6:
-		is_multicast =
-			xt_cluster_ipv6_is_multicast(&ipv6_hdr(skb)->daddr);
+		is_multicast = ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr);
 		break;
 	default:
 		WARN_ON(1);
-- 
cgit v1.2.3


From 433029ecc62788296cacca50ceb24db90c17a4a2 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Sun, 11 Feb 2018 23:28:18 +0900
Subject: netfilter: nf_conntrack_broadcast: remove useless parameter

parameter protoff in nf_conntrack_broadcast_help is not used anywhere.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_broadcast.c  | 1 -
 net/netfilter/nf_conntrack_netbios_ns.c | 5 +++--
 net/netfilter/nf_conntrack_snmp.c       | 5 +++--
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index ecc3ab784633..a1086bdec242 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -20,7 +20,6 @@
 #include <net/netfilter/nf_conntrack_expect.h>
 
 int nf_conntrack_broadcast_help(struct sk_buff *skb,
-				unsigned int protoff,
 				struct nf_conn *ct,
 				enum ip_conntrack_info ctinfo,
 				unsigned int timeout)
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
index 496ce173f0c1..a4a59dc7cf17 100644
--- a/net/netfilter/nf_conntrack_netbios_ns.c
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -41,9 +41,10 @@ static struct nf_conntrack_expect_policy exp_policy = {
 };
 
 static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff,
-		   struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+			   struct nf_conn *ct,
+			   enum ip_conntrack_info ctinfo)
 {
-	return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+	return nf_conntrack_broadcast_help(skb, ct, ctinfo, timeout);
 }
 
 static struct nf_conntrack_helper helper __read_mostly = {
diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c
index 87b95a2c270c..2d0f8e010821 100644
--- a/net/netfilter/nf_conntrack_snmp.c
+++ b/net/netfilter/nf_conntrack_snmp.c
@@ -36,11 +36,12 @@ int (*nf_nat_snmp_hook)(struct sk_buff *skb,
 EXPORT_SYMBOL_GPL(nf_nat_snmp_hook);
 
 static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff,
-		struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+			       struct nf_conn *ct,
+			       enum ip_conntrack_info ctinfo)
 {
 	typeof(nf_nat_snmp_hook) nf_nat_snmp;
 
-	nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+	nf_conntrack_broadcast_help(skb, ct, ctinfo, timeout);
 
 	nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook);
 	if (nf_nat_snmp && ct->status & IPS_NAT_MASK)
-- 
cgit v1.2.3


From 2db3fec507bd822ed81c031221b97701238949dc Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Tue, 13 Feb 2018 08:25:57 -0600
Subject: netfilter: ipt_ah: return boolean instead of integer

Return statements in functions returning bool should use
true/false instead of 1/0.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_ah.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index a787d07f6cb7..7c6c20eaf4db 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -47,7 +47,7 @@ static bool ah_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		 */
 		pr_debug("Dropping evil AH tinygram.\n");
 		par->hotdrop = true;
-		return 0;
+		return false;
 	}
 
 	return spi_match(ahinfo->spis[0], ahinfo->spis[1],
-- 
cgit v1.2.3


From f31e5f1a891f989f107e8caa6b49dd4df0e12265 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 16 Feb 2018 18:04:56 +0800
Subject: netfilter: unlock xt_table earlier in __do_replace

Now it's doing cleanup_entry for oldinfo under the xt_table lock,
but it's not really necessary. After the replacement job is done
in xt_replace_table, oldinfo is not used elsewhere any more, and
it can be freed without xt_table lock safely.

The important thing is that rtnl_lock is called in some xt_target
destroy, which means rtnl_lock, a big lock is used in xt_table
lock, a smaller one. It usually could be the reason why a dead
lock may happen.

Besides, all xt_target/match checkentry is called out of xt_table
lock. It's better also to move all cleanup_entry calling out of
xt_table lock, just as do_replace_finish does for ebtables.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 3 ++-
 net/ipv4/netfilter/ip_tables.c  | 3 ++-
 net/ipv6/netfilter/ip6_tables.c | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index c36ffce3c812..a0c7ce76879c 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -925,6 +925,8 @@ static int __do_replace(struct net *net, const char *name,
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
+	xt_table_unlock(t);
+
 	get_old_counters(oldinfo, counters);
 
 	/* Decrease module usage counts and free resource */
@@ -939,7 +941,6 @@ static int __do_replace(struct net *net, const char *name,
 		net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n");
 	}
 	vfree(counters);
-	xt_table_unlock(t);
 	return ret;
 
  put_module:
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d4f7584d2dbe..4f7153e25e0b 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1087,6 +1087,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
+	xt_table_unlock(t);
+
 	get_old_counters(oldinfo, counters);
 
 	/* Decrease module usage counts and free resource */
@@ -1100,7 +1102,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 		net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
 	}
 	vfree(counters);
-	xt_table_unlock(t);
 	return ret;
 
  put_module:
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 4de8ac1e5af4..6c44033decab 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1105,6 +1105,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	    (newinfo->number <= oldinfo->initial_entries))
 		module_put(t->me);
 
+	xt_table_unlock(t);
+
 	get_old_counters(oldinfo, counters);
 
 	/* Decrease module usage counts and free resource */
@@ -1118,7 +1120,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 		net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n");
 	}
 	vfree(counters);
-	xt_table_unlock(t);
 	return ret;
 
  put_module:
-- 
cgit v1.2.3


From 07a9da51b4b6aece8bc71e0b1b601fc4c3eb8b64 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 27 Feb 2018 19:42:27 +0100
Subject: netfilter: x_tables: check standard verdicts in core

Userspace must provide a valid verdict to the standard target.

The verdict can be either a jump (signed int > 0), or a return code.

Allowed return codes are either RETURN (pop from stack), NF_ACCEPT, DROP
and QUEUE (latter is allowed for legacy reasons).

Jump offsets (verdict > 0) are checked in more detail later on when
loop-detection is performed.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c |  5 -----
 net/ipv4/netfilter/ip_tables.c  |  5 -----
 net/ipv6/netfilter/ip6_tables.c |  5 -----
 net/netfilter/x_tables.c        | 49 ++++++++++++++++++++++++++++++++++++-----
 4 files changed, 43 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index a0c7ce76879c..c9ffa884a4ee 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -334,11 +334,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 			     t->verdict < 0) || visited) {
 				unsigned int oldpos, size;
 
-				if ((strcmp(t->target.u.user.name,
-					    XT_STANDARD_TARGET) == 0) &&
-				    t->verdict < -NF_MAX_VERDICT - 1)
-					return 0;
-
 				/* Return: backtrack through the last
 				 * big jump.
 				 */
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 4f7153e25e0b..c9b57a6bf96a 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -402,11 +402,6 @@ mark_source_chains(const struct xt_table_info *newinfo,
 			     t->verdict < 0) || visited) {
 				unsigned int oldpos, size;
 
-				if ((strcmp(t->target.u.user.name,
-					    XT_STANDARD_TARGET) == 0) &&
-				    t->verdict < -NF_MAX_VERDICT - 1)
-					return 0;
-
 				/* Return: backtrack through the last
 				   big jump. */
 				do {
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 6c44033decab..f46954221933 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -420,11 +420,6 @@ mark_source_chains(const struct xt_table_info *newinfo,
 			     t->verdict < 0) || visited) {
 				unsigned int oldpos, size;
 
-				if ((strcmp(t->target.u.user.name,
-					    XT_STANDARD_TARGET) == 0) &&
-				    t->verdict < -NF_MAX_VERDICT - 1)
-					return 0;
-
 				/* Return: backtrack through the last
 				   big jump. */
 				do {
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index d9deebe599ec..2e4d423e58e6 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -654,6 +654,31 @@ struct compat_xt_standard_target {
 	compat_uint_t verdict;
 };
 
+static bool verdict_ok(int verdict)
+{
+	if (verdict > 0)
+		return true;
+
+	if (verdict < 0) {
+		int v = -verdict - 1;
+
+		if (verdict == XT_RETURN)
+			return true;
+
+		switch (v) {
+		case NF_ACCEPT: return true;
+		case NF_DROP: return true;
+		case NF_QUEUE: return true;
+		default:
+			break;
+		}
+
+		return false;
+	}
+
+	return false;
+}
+
 int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				  unsigned int target_offset,
 				  unsigned int next_offset)
@@ -675,9 +700,15 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
 	if (target_offset + t->u.target_size > next_offset)
 		return -EINVAL;
 
-	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 &&
-	    COMPAT_XT_ALIGN(target_offset + sizeof(struct compat_xt_standard_target)) != next_offset)
-		return -EINVAL;
+	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0) {
+		const struct compat_xt_standard_target *st = (const void *)t;
+
+		if (COMPAT_XT_ALIGN(target_offset + sizeof(*st)) != next_offset)
+			return -EINVAL;
+
+		if (!verdict_ok(st->verdict))
+			return -EINVAL;
+	}
 
 	/* compat_xt_entry match has less strict alignment requirements,
 	 * otherwise they are identical.  In case of padding differences
@@ -757,9 +788,15 @@ int xt_check_entry_offsets(const void *base,
 	if (target_offset + t->u.target_size > next_offset)
 		return -EINVAL;
 
-	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0 &&
-	    XT_ALIGN(target_offset + sizeof(struct xt_standard_target)) != next_offset)
-		return -EINVAL;
+	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) == 0) {
+		const struct xt_standard_target *st = (const void *)t;
+
+		if (XT_ALIGN(target_offset + sizeof(*st)) != next_offset)
+			return -EINVAL;
+
+		if (!verdict_ok(st->verdict))
+			return -EINVAL;
+	}
 
 	return xt_check_entry_match(elems, base + target_offset,
 				    __alignof__(struct xt_entry_match));
-- 
cgit v1.2.3


From 472ebdcd15ebdb8ebe20474ef1ce09abcb241e7d Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 27 Feb 2018 19:42:28 +0100
Subject: netfilter: x_tables: check error target size too

Check that userspace ERROR target (custom user-defined chains) match
expected format, and the chain name is null terminated.

This is irrelevant for kernel, but iptables itself relies on sane input
when it dumps rules from kernel.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 2e4d423e58e6..f045bb4f7063 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -654,6 +654,11 @@ struct compat_xt_standard_target {
 	compat_uint_t verdict;
 };
 
+struct compat_xt_error_target {
+	struct compat_xt_entry_target t;
+	char errorname[XT_FUNCTION_MAXNAMELEN];
+};
+
 static bool verdict_ok(int verdict)
 {
 	if (verdict > 0)
@@ -679,6 +684,12 @@ static bool verdict_ok(int verdict)
 	return false;
 }
 
+static bool error_tg_ok(unsigned int usersize, unsigned int kernsize,
+			const char *msg, unsigned int msglen)
+{
+	return usersize == kernsize && strnlen(msg, msglen) < msglen;
+}
+
 int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				  unsigned int target_offset,
 				  unsigned int next_offset)
@@ -708,6 +719,12 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
 
 		if (!verdict_ok(st->verdict))
 			return -EINVAL;
+	} else if (strcmp(t->u.user.name, XT_ERROR_TARGET) == 0) {
+		const struct compat_xt_error_target *et = (const void *)t;
+
+		if (!error_tg_ok(t->u.target_size, sizeof(*et),
+				 et->errorname, sizeof(et->errorname)))
+			return -EINVAL;
 	}
 
 	/* compat_xt_entry match has less strict alignment requirements,
@@ -796,6 +813,12 @@ int xt_check_entry_offsets(const void *base,
 
 		if (!verdict_ok(st->verdict))
 			return -EINVAL;
+	} else if (strcmp(t->u.user.name, XT_ERROR_TARGET) == 0) {
+		const struct xt_error_target *et = (const void *)t;
+
+		if (!error_tg_ok(t->u.target_size, sizeof(*et),
+				 et->errorname, sizeof(et->errorname)))
+			return -EINVAL;
 	}
 
 	return xt_check_entry_match(elems, base + target_offset,
-- 
cgit v1.2.3


From 1b293e30f759b03f246baae862bdf35e57b2c39e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 27 Feb 2018 19:42:29 +0100
Subject: netfilter: x_tables: move hook entry checks into core

Allow followup patch to change on location instead of three.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 13 +++----------
 net/ipv4/netfilter/ip_tables.c  | 13 +++----------
 net/ipv6/netfilter/ip6_tables.c | 13 +++----------
 net/netfilter/x_tables.c        | 29 +++++++++++++++++++++++++++++
 4 files changed, 38 insertions(+), 30 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index c9ffa884a4ee..be5821215ea0 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -555,16 +555,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
 	if (i != repl->num_entries)
 		goto out_free;
 
-	/* Check hooks all assigned */
-	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
-		/* Only hooks which are valid */
-		if (!(repl->valid_hooks & (1 << i)))
-			continue;
-		if (newinfo->hook_entry[i] == 0xFFFFFFFF)
-			goto out_free;
-		if (newinfo->underflow[i] == 0xFFFFFFFF)
-			goto out_free;
-	}
+	ret = xt_check_table_hooks(newinfo, repl->valid_hooks);
+	if (ret)
+		goto out_free;
 
 	if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
 		ret = -ELOOP;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index c9b57a6bf96a..29bda9484a33 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -702,16 +702,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 	if (i != repl->num_entries)
 		goto out_free;
 
-	/* Check hooks all assigned */
-	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
-		/* Only hooks which are valid */
-		if (!(repl->valid_hooks & (1 << i)))
-			continue;
-		if (newinfo->hook_entry[i] == 0xFFFFFFFF)
-			goto out_free;
-		if (newinfo->underflow[i] == 0xFFFFFFFF)
-			goto out_free;
-	}
+	ret = xt_check_table_hooks(newinfo, repl->valid_hooks);
+	if (ret)
+		goto out_free;
 
 	if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
 		ret = -ELOOP;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index f46954221933..ba3776a4d305 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -720,16 +720,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 	if (i != repl->num_entries)
 		goto out_free;
 
-	/* Check hooks all assigned */
-	for (i = 0; i < NF_INET_NUMHOOKS; i++) {
-		/* Only hooks which are valid */
-		if (!(repl->valid_hooks & (1 << i)))
-			continue;
-		if (newinfo->hook_entry[i] == 0xFFFFFFFF)
-			goto out_free;
-		if (newinfo->underflow[i] == 0xFFFFFFFF)
-			goto out_free;
-	}
+	ret = xt_check_table_hooks(newinfo, repl->valid_hooks);
+	if (ret)
+		goto out_free;
 
 	if (!mark_source_chains(newinfo, repl->valid_hooks, entry0, offsets)) {
 		ret = -ELOOP;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index f045bb4f7063..5d8ba89a8da8 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -518,6 +518,35 @@ static int xt_check_entry_match(const char *match, const char *target,
 	return 0;
 }
 
+/** xt_check_table_hooks - check hook entry points are sane
+ *
+ * @info xt_table_info to check
+ * @valid_hooks - hook entry points that we can enter from
+ *
+ * Validates that the hook entry and underflows points are set up.
+ *
+ * Return: 0 on success, negative errno on failure.
+ */
+int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks)
+{
+	unsigned int i;
+
+	BUILD_BUG_ON(ARRAY_SIZE(info->hook_entry) != ARRAY_SIZE(info->underflow));
+
+	for (i = 0; i < ARRAY_SIZE(info->hook_entry); i++) {
+		if (!(valid_hooks & (1 << i)))
+			continue;
+
+		if (info->hook_entry[i] == 0xFFFFFFFF)
+			return -EINVAL;
+		if (info->underflow[i] == 0xFFFFFFFF)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(xt_check_table_hooks);
+
 #ifdef CONFIG_COMPAT
 int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
 {
-- 
cgit v1.2.3


From e816a2ce49e49e3906f614009c919334a0c6ba6a Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 27 Feb 2018 19:42:30 +0100
Subject: netfilter: x_tables: enforce unique and ascending entry points

Harmless from kernel point of view, but iptables assumes that this is
true when decoding a ruleset.

iptables walks the dumped blob from kernel, and, for each entry that
creates a new chain it prints out rule/chain information.
Base chains (hook entry points) are thus only shown when they appear
in the rule blob.  One base chain that is referenced multiple times
in hook blob is then only printed once.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 5d8ba89a8da8..4e6cbb38e616 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -529,10 +529,15 @@ static int xt_check_entry_match(const char *match, const char *target,
  */
 int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks)
 {
-	unsigned int i;
+	const char *err = "unsorted underflow";
+	unsigned int i, max_uflow, max_entry;
+	bool check_hooks = false;
 
 	BUILD_BUG_ON(ARRAY_SIZE(info->hook_entry) != ARRAY_SIZE(info->underflow));
 
+	max_entry = 0;
+	max_uflow = 0;
+
 	for (i = 0; i < ARRAY_SIZE(info->hook_entry); i++) {
 		if (!(valid_hooks & (1 << i)))
 			continue;
@@ -541,9 +546,33 @@ int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_ho
 			return -EINVAL;
 		if (info->underflow[i] == 0xFFFFFFFF)
 			return -EINVAL;
+
+		if (check_hooks) {
+			if (max_uflow > info->underflow[i])
+				goto error;
+
+			if (max_uflow == info->underflow[i]) {
+				err = "duplicate underflow";
+				goto error;
+			}
+			if (max_entry > info->hook_entry[i]) {
+				err = "unsorted entry";
+				goto error;
+			}
+			if (max_entry == info->hook_entry[i]) {
+				err = "duplicate entry";
+				goto error;
+			}
+		}
+		max_entry = info->hook_entry[i];
+		max_uflow = info->underflow[i];
+		check_hooks = true;
 	}
 
 	return 0;
+error:
+	pr_err_ratelimited("%s at hook %d\n", err, i);
+	return -EINVAL;
 }
 EXPORT_SYMBOL(xt_check_table_hooks);
 
-- 
cgit v1.2.3


From 19926968ea86a286aa6fbea16ee3f2e7442f10f0 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 27 Feb 2018 19:42:31 +0100
Subject: netfilter: x_tables: cap allocations at 512 mbyte

Arbitrary limit, however, this still allows huge rulesets
(> 1 million rules).  This helps with automated fuzzer as it prevents
oom-killer invocation.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 4e6cbb38e616..dc68ac49614a 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -40,6 +40,7 @@ MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
 MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
 
 #define XT_PCPU_BLOCK_SIZE 4096
+#define XT_MAX_TABLE_SIZE	(512 * 1024 * 1024)
 
 struct compat_delta {
 	unsigned int offset; /* offset in kernel */
@@ -1117,7 +1118,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
 	struct xt_table_info *info = NULL;
 	size_t sz = sizeof(*info) + size;
 
-	if (sz < sizeof(*info))
+	if (sz < sizeof(*info) || sz >= XT_MAX_TABLE_SIZE)
 		return NULL;
 
 	/* __GFP_NORETRY is not fully supported by kvmalloc but it should
-- 
cgit v1.2.3


From 9d5c12a7c08f67999772065afd50fb222072114e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 27 Feb 2018 19:42:32 +0100
Subject: netfilter: x_tables: limit allocation requests for blob rule heads

This is a very conservative limit (134217728 rules), but good
enough to not trigger frequent oom from syzkaller.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index dc68ac49614a..01f8e122e74e 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -894,6 +894,9 @@ EXPORT_SYMBOL(xt_check_entry_offsets);
  */
 unsigned int *xt_alloc_entry_offsets(unsigned int size)
 {
+	if (size > XT_MAX_TABLE_SIZE / sizeof(unsigned int))
+		return NULL;
+
 	return kvmalloc_array(size, sizeof(unsigned int), GFP_KERNEL | __GFP_ZERO);
 
 }
-- 
cgit v1.2.3


From c84ca954ac9fa67a6ce27f91f01e4451c74fd8f6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 27 Feb 2018 19:42:33 +0100
Subject: netfilter: x_tables: add counters allocation wrapper

allows to have size checks in a single spot.
This is supposed to reduce oom situations when fuzz-testing xtables.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c |  2 +-
 net/ipv4/netfilter/ip_tables.c  |  2 +-
 net/ipv6/netfilter/ip6_tables.c |  2 +-
 net/netfilter/x_tables.c        | 15 +++++++++++++++
 4 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index be5821215ea0..82ba09b50fdb 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -883,7 +883,7 @@ static int __do_replace(struct net *net, const char *name,
 	struct arpt_entry *iter;
 
 	ret = 0;
-	counters = vzalloc(num_counters * sizeof(struct xt_counters));
+	counters = xt_counters_alloc(num_counters);
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 29bda9484a33..4901ca6c3e09 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1045,7 +1045,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ipt_entry *iter;
 
 	ret = 0;
-	counters = vzalloc(num_counters * sizeof(struct xt_counters));
+	counters = xt_counters_alloc(num_counters);
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index ba3776a4d305..e84cec49b60f 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1063,7 +1063,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ip6t_entry *iter;
 
 	ret = 0;
-	counters = vzalloc(num_counters * sizeof(struct xt_counters));
+	counters = xt_counters_alloc(num_counters);
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 01f8e122e74e..82b1f8f52ac6 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1290,6 +1290,21 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)
 	return 0;
 }
 
+struct xt_counters *xt_counters_alloc(unsigned int counters)
+{
+	struct xt_counters *mem;
+
+	if (counters == 0 || counters > INT_MAX / sizeof(*mem))
+		return NULL;
+
+	counters *= sizeof(*mem);
+	if (counters > XT_MAX_TABLE_SIZE)
+		return NULL;
+
+	return vzalloc(counters);
+}
+EXPORT_SYMBOL(xt_counters_alloc);
+
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
 	      unsigned int num_counters,
-- 
cgit v1.2.3


From 9782a11efc072faaf91d4aa60e9d23553f918029 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 27 Feb 2018 19:42:34 +0100
Subject: netfilter: compat: prepare xt_compat_init_offsets to return errors

should have no impact, function still always returns 0.
This patch is only to ease review.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebtables.c | 10 ++++++++--
 net/ipv4/netfilter/arp_tables.c | 10 +++++++---
 net/ipv4/netfilter/ip_tables.c  |  8 ++++++--
 net/ipv6/netfilter/ip6_tables.c | 10 +++++++---
 net/netfilter/x_tables.c        |  4 +++-
 5 files changed, 31 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 02c4b409d317..217aa79f7b2a 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1819,10 +1819,14 @@ static int compat_table_info(const struct ebt_table_info *info,
 {
 	unsigned int size = info->entries_size;
 	const void *entries = info->entries;
+	int ret;
 
 	newinfo->entries_size = size;
 
-	xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries);
+	ret = xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries);
+	if (ret)
+		return ret;
+
 	return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info,
 							entries, newinfo);
 }
@@ -2245,7 +2249,9 @@ static int compat_do_replace(struct net *net, void __user *user,
 
 	xt_compat_lock(NFPROTO_BRIDGE);
 
-	xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries);
+	ret = xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries);
+	if (ret < 0)
+		goto out_unlock;
 	ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state);
 	if (ret < 0)
 		goto out_unlock;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 82ba09b50fdb..aaafdbd15ad3 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -769,7 +769,9 @@ static int compat_table_info(const struct xt_table_info *info,
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
 	loc_cpu_entry = info->entries;
-	xt_compat_init_offsets(NFPROTO_ARP, info->number);
+	ret = xt_compat_init_offsets(NFPROTO_ARP, info->number);
+	if (ret)
+		return ret;
 	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
 		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
 		if (ret != 0)
@@ -1156,7 +1158,7 @@ static int translate_compat_table(struct xt_table_info **pinfo,
 	struct compat_arpt_entry *iter0;
 	struct arpt_replace repl;
 	unsigned int size;
-	int ret = 0;
+	int ret;
 
 	info = *pinfo;
 	entry0 = *pentry0;
@@ -1165,7 +1167,9 @@ static int translate_compat_table(struct xt_table_info **pinfo,
 
 	j = 0;
 	xt_compat_lock(NFPROTO_ARP);
-	xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries);
+	ret = xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries);
+	if (ret)
+		goto out_unlock;
 	/* Walk through entries, checking offsets. */
 	xt_entry_foreach(iter0, entry0, compatr->size) {
 		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 4901ca6c3e09..f9063513f9d1 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -933,7 +933,9 @@ static int compat_table_info(const struct xt_table_info *info,
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
 	loc_cpu_entry = info->entries;
-	xt_compat_init_offsets(AF_INET, info->number);
+	ret = xt_compat_init_offsets(AF_INET, info->number);
+	if (ret)
+		return ret;
 	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
 		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
 		if (ret != 0)
@@ -1407,7 +1409,9 @@ translate_compat_table(struct net *net,
 
 	j = 0;
 	xt_compat_lock(AF_INET);
-	xt_compat_init_offsets(AF_INET, compatr->num_entries);
+	ret = xt_compat_init_offsets(AF_INET, compatr->num_entries);
+	if (ret)
+		goto out_unlock;
 	/* Walk through entries, checking offsets. */
 	xt_entry_foreach(iter0, entry0, compatr->size) {
 		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index e84cec49b60f..3c36a4c77f29 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -950,7 +950,9 @@ static int compat_table_info(const struct xt_table_info *info,
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
 	loc_cpu_entry = info->entries;
-	xt_compat_init_offsets(AF_INET6, info->number);
+	ret = xt_compat_init_offsets(AF_INET6, info->number);
+	if (ret)
+		return ret;
 	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
 		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
 		if (ret != 0)
@@ -1414,7 +1416,7 @@ translate_compat_table(struct net *net,
 	struct compat_ip6t_entry *iter0;
 	struct ip6t_replace repl;
 	unsigned int size;
-	int ret = 0;
+	int ret;
 
 	info = *pinfo;
 	entry0 = *pentry0;
@@ -1423,7 +1425,9 @@ translate_compat_table(struct net *net,
 
 	j = 0;
 	xt_compat_lock(AF_INET6);
-	xt_compat_init_offsets(AF_INET6, compatr->num_entries);
+	ret = xt_compat_init_offsets(AF_INET6, compatr->num_entries);
+	if (ret)
+		goto out_unlock;
 	/* Walk through entries, checking offsets. */
 	xt_entry_foreach(iter0, entry0, compatr->size) {
 		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 82b1f8f52ac6..e878c85a9268 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -632,10 +632,12 @@ int xt_compat_calc_jump(u_int8_t af, unsigned int offset)
 }
 EXPORT_SYMBOL_GPL(xt_compat_calc_jump);
 
-void xt_compat_init_offsets(u_int8_t af, unsigned int number)
+int xt_compat_init_offsets(u8 af, unsigned int number)
 {
 	xt[af].number = number;
 	xt[af].cur = 0;
+
+	return 0;
 }
 EXPORT_SYMBOL(xt_compat_init_offsets);
 
-- 
cgit v1.2.3


From 7d7d7e02111e9a4dc9d0658597f528f815d820fd Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 27 Feb 2018 19:42:35 +0100
Subject: netfilter: compat: reject huge allocation requests

no need to bother even trying to allocating huge compat offset arrays,
such ruleset is rejected later on anyway becaus we refuse to allocate
overly large rule blobs.

However, compat translation happens before blob allocation, so we should
add a check there too.

This is supposed to help with fuzzing by avoiding oom-killer.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index e878c85a9268..33724b08b8f0 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -582,14 +582,8 @@ int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
 {
 	struct xt_af *xp = &xt[af];
 
-	if (!xp->compat_tab) {
-		if (!xp->number)
-			return -EINVAL;
-		xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number);
-		if (!xp->compat_tab)
-			return -ENOMEM;
-		xp->cur = 0;
-	}
+	if (WARN_ON(!xp->compat_tab))
+		return -ENOMEM;
 
 	if (xp->cur >= xp->number)
 		return -EINVAL;
@@ -634,6 +628,22 @@ EXPORT_SYMBOL_GPL(xt_compat_calc_jump);
 
 int xt_compat_init_offsets(u8 af, unsigned int number)
 {
+	size_t mem;
+
+	if (!number || number > (INT_MAX / sizeof(struct compat_delta)))
+		return -EINVAL;
+
+	if (WARN_ON(xt[af].compat_tab))
+		return -EINVAL;
+
+	mem = sizeof(struct compat_delta) * number;
+	if (mem > XT_MAX_TABLE_SIZE)
+		return -ENOMEM;
+
+	xt[af].compat_tab = vmalloc(mem);
+	if (!xt[af].compat_tab)
+		return -ENOMEM;
+
 	xt[af].number = number;
 	xt[af].cur = 0;
 
-- 
cgit v1.2.3


From 89370860686a54fc0642c7ae68213cc1fc6d8e04 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 27 Feb 2018 19:42:36 +0100
Subject: netfilter: x_tables: make sure compat af mutex is held

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 33724b08b8f0..7521e8a72c06 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -582,6 +582,8 @@ int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
 {
 	struct xt_af *xp = &xt[af];
 
+	WARN_ON(!mutex_is_locked(&xt[af].compat_mutex));
+
 	if (WARN_ON(!xp->compat_tab))
 		return -ENOMEM;
 
@@ -599,6 +601,8 @@ EXPORT_SYMBOL_GPL(xt_compat_add_offset);
 
 void xt_compat_flush_offsets(u_int8_t af)
 {
+	WARN_ON(!mutex_is_locked(&xt[af].compat_mutex));
+
 	if (xt[af].compat_tab) {
 		vfree(xt[af].compat_tab);
 		xt[af].compat_tab = NULL;
@@ -630,6 +634,8 @@ int xt_compat_init_offsets(u8 af, unsigned int number)
 {
 	size_t mem;
 
+	WARN_ON(!mutex_is_locked(&xt[af].compat_mutex));
+
 	if (!number || number > (INT_MAX / sizeof(struct compat_delta)))
 		return -EINVAL;
 
-- 
cgit v1.2.3


From 0d7df906a0e78079a02108b06d32c3ef2238ad25 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 27 Feb 2018 19:42:37 +0100
Subject: netfilter: x_tables: ensure last rule in base chain matches
 underflow/policy

Harmless from kernel point of view, but again iptables assumes that
this is true when decoding ruleset coming from kernel.

If a (syzkaller generated) ruleset doesn't have the underflow/policy
stored as the last rule in the base chain, then iptables will abort()
because it doesn't find the chain policy.

libiptc assumes that the policy is the last rule in the basechain, which
is only true for iptables-generated rulesets.

Unfortunately this needs code duplication -- the functions need the
struct layout of the rule head, but that is different for
ip/ip6/arptables.

NB: pr_warn could be pr_debug but in case this break rulesets somehow its
useful to know why blob was rejected.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 17 ++++++++++++++++-
 net/ipv4/netfilter/ip_tables.c  | 17 ++++++++++++++++-
 net/ipv6/netfilter/ip6_tables.c | 17 ++++++++++++++++-
 3 files changed, 48 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index aaafdbd15ad3..f366ff1cfc19 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -309,10 +309,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 	for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
 		unsigned int pos = newinfo->hook_entry[hook];
 		struct arpt_entry *e = entry0 + pos;
+		unsigned int last_pos, depth;
 
 		if (!(valid_hooks & (1 << hook)))
 			continue;
 
+		depth = 0;
+		last_pos = pos;
 		/* Set initial back pointer. */
 		e->counters.pcnt = pos;
 
@@ -343,6 +346,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 					pos = e->counters.pcnt;
 					e->counters.pcnt = 0;
 
+					if (depth)
+						--depth;
 					/* We're at the start. */
 					if (pos == oldpos)
 						goto next;
@@ -367,6 +372,9 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 					if (!xt_find_jump_offset(offsets, newpos,
 								 newinfo->number))
 						return 0;
+
+					if (entry0 + newpos != arpt_next_entry(e))
+						++depth;
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
@@ -377,8 +385,15 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 				e->counters.pcnt = pos;
 				pos = newpos;
 			}
+			if (depth == 0)
+				last_pos = pos;
+		}
+next:
+		if (last_pos != newinfo->underflow[hook]) {
+			pr_err_ratelimited("last base chain position %u doesn't match underflow %u (hook %u)\n",
+					   last_pos, newinfo->underflow[hook], hook);
+			return 0;
 		}
-next:		;
 	}
 	return 1;
 }
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index f9063513f9d1..2362ca2c9e0c 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -378,10 +378,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
 	for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
 		unsigned int pos = newinfo->hook_entry[hook];
 		struct ipt_entry *e = entry0 + pos;
+		unsigned int last_pos, depth;
 
 		if (!(valid_hooks & (1 << hook)))
 			continue;
 
+		depth = 0;
+		last_pos = pos;
 		/* Set initial back pointer. */
 		e->counters.pcnt = pos;
 
@@ -410,6 +413,8 @@ mark_source_chains(const struct xt_table_info *newinfo,
 					pos = e->counters.pcnt;
 					e->counters.pcnt = 0;
 
+					if (depth)
+						--depth;
 					/* We're at the start. */
 					if (pos == oldpos)
 						goto next;
@@ -434,6 +439,9 @@ mark_source_chains(const struct xt_table_info *newinfo,
 					if (!xt_find_jump_offset(offsets, newpos,
 								 newinfo->number))
 						return 0;
+
+					if (entry0 + newpos != ipt_next_entry(e))
+						++depth;
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
@@ -444,8 +452,15 @@ mark_source_chains(const struct xt_table_info *newinfo,
 				e->counters.pcnt = pos;
 				pos = newpos;
 			}
+			if (depth == 0)
+				last_pos = pos;
+		}
+next:
+		if (last_pos != newinfo->underflow[hook]) {
+			pr_err_ratelimited("last base chain position %u doesn't match underflow %u (hook %u)\n",
+					   last_pos, newinfo->underflow[hook], hook);
+			return 0;
 		}
-next:		;
 	}
 	return 1;
 }
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 3c36a4c77f29..004508753abc 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -396,10 +396,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
 	for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
 		unsigned int pos = newinfo->hook_entry[hook];
 		struct ip6t_entry *e = entry0 + pos;
+		unsigned int last_pos, depth;
 
 		if (!(valid_hooks & (1 << hook)))
 			continue;
 
+		depth = 0;
+		last_pos = pos;
 		/* Set initial back pointer. */
 		e->counters.pcnt = pos;
 
@@ -428,6 +431,8 @@ mark_source_chains(const struct xt_table_info *newinfo,
 					pos = e->counters.pcnt;
 					e->counters.pcnt = 0;
 
+					if (depth)
+						--depth;
 					/* We're at the start. */
 					if (pos == oldpos)
 						goto next;
@@ -452,6 +457,9 @@ mark_source_chains(const struct xt_table_info *newinfo,
 					if (!xt_find_jump_offset(offsets, newpos,
 								 newinfo->number))
 						return 0;
+
+					if (entry0 + newpos != ip6t_next_entry(e))
+						++depth;
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
@@ -462,8 +470,15 @@ mark_source_chains(const struct xt_table_info *newinfo,
 				e->counters.pcnt = pos;
 				pos = newpos;
 			}
+			if (depth == 0)
+				last_pos = pos;
+		}
+next:
+		if (last_pos != newinfo->underflow[hook]) {
+			pr_err_ratelimited("last base chain position %u doesn't match underflow %u (hook %u)\n",
+					   last_pos, newinfo->underflow[hook], hook);
+			return 0;
 		}
-next:		;
 	}
 	return 1;
 }
-- 
cgit v1.2.3


From 3427b2ab63faccafe774ea997fc2da7faf690c5a Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 1 Mar 2018 18:58:38 -0800
Subject: netfilter: make xt_rateest hash table per net

As suggested by Eric, we need to make the xt_rateest
hash table and its lock per netns to reduce lock
contentions.

Cc: Florian Westphal <fw@strlen.de>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_RATEEST.c | 91 +++++++++++++++++++++++++++++++++-------------
 net/netfilter/xt_rateest.c | 10 ++---
 2 files changed, 70 insertions(+), 31 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c
index 141c295191f6..dec843cadf46 100644
--- a/net/netfilter/xt_RATEEST.c
+++ b/net/netfilter/xt_RATEEST.c
@@ -14,15 +14,21 @@
 #include <linux/slab.h>
 #include <net/gen_stats.h>
 #include <net/netlink.h>
+#include <net/netns/generic.h>
 
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_RATEEST.h>
 #include <net/netfilter/xt_rateest.h>
 
-static DEFINE_MUTEX(xt_rateest_mutex);
-
 #define RATEEST_HSIZE	16
-static struct hlist_head rateest_hash[RATEEST_HSIZE] __read_mostly;
+
+struct xt_rateest_net {
+	struct mutex hash_lock;
+	struct hlist_head hash[RATEEST_HSIZE];
+};
+
+static unsigned int xt_rateest_id;
+
 static unsigned int jhash_rnd __read_mostly;
 
 static unsigned int xt_rateest_hash(const char *name)
@@ -31,21 +37,23 @@ static unsigned int xt_rateest_hash(const char *name)
 	       (RATEEST_HSIZE - 1);
 }
 
-static void xt_rateest_hash_insert(struct xt_rateest *est)
+static void xt_rateest_hash_insert(struct xt_rateest_net *xn,
+				   struct xt_rateest *est)
 {
 	unsigned int h;
 
 	h = xt_rateest_hash(est->name);
-	hlist_add_head(&est->list, &rateest_hash[h]);
+	hlist_add_head(&est->list, &xn->hash[h]);
 }
 
-static struct xt_rateest *__xt_rateest_lookup(const char *name)
+static struct xt_rateest *__xt_rateest_lookup(struct xt_rateest_net *xn,
+					      const char *name)
 {
 	struct xt_rateest *est;
 	unsigned int h;
 
 	h = xt_rateest_hash(name);
-	hlist_for_each_entry(est, &rateest_hash[h], list) {
+	hlist_for_each_entry(est, &xn->hash[h], list) {
 		if (strcmp(est->name, name) == 0) {
 			est->refcnt++;
 			return est;
@@ -55,20 +63,23 @@ static struct xt_rateest *__xt_rateest_lookup(const char *name)
 	return NULL;
 }
 
-struct xt_rateest *xt_rateest_lookup(const char *name)
+struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name)
 {
+	struct xt_rateest_net *xn = net_generic(net, xt_rateest_id);
 	struct xt_rateest *est;
 
-	mutex_lock(&xt_rateest_mutex);
-	est = __xt_rateest_lookup(name);
-	mutex_unlock(&xt_rateest_mutex);
+	mutex_lock(&xn->hash_lock);
+	est = __xt_rateest_lookup(xn, name);
+	mutex_unlock(&xn->hash_lock);
 	return est;
 }
 EXPORT_SYMBOL_GPL(xt_rateest_lookup);
 
-void xt_rateest_put(struct xt_rateest *est)
+void xt_rateest_put(struct net *net, struct xt_rateest *est)
 {
-	mutex_lock(&xt_rateest_mutex);
+	struct xt_rateest_net *xn = net_generic(net, xt_rateest_id);
+
+	mutex_lock(&xn->hash_lock);
 	if (--est->refcnt == 0) {
 		hlist_del(&est->list);
 		gen_kill_estimator(&est->rate_est);
@@ -78,7 +89,7 @@ void xt_rateest_put(struct xt_rateest *est)
 		 */
 		kfree_rcu(est, rcu);
 	}
-	mutex_unlock(&xt_rateest_mutex);
+	mutex_unlock(&xn->hash_lock);
 }
 EXPORT_SYMBOL_GPL(xt_rateest_put);
 
@@ -98,6 +109,7 @@ xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par)
 
 static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 {
+	struct xt_rateest_net *xn = net_generic(par->net, xt_rateest_id);
 	struct xt_rateest_target_info *info = par->targinfo;
 	struct xt_rateest *est;
 	struct {
@@ -108,10 +120,10 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 
 	net_get_random_once(&jhash_rnd, sizeof(jhash_rnd));
 
-	mutex_lock(&xt_rateest_mutex);
-	est = __xt_rateest_lookup(info->name);
+	mutex_lock(&xn->hash_lock);
+	est = __xt_rateest_lookup(xn, info->name);
 	if (est) {
-		mutex_unlock(&xt_rateest_mutex);
+		mutex_unlock(&xn->hash_lock);
 		/*
 		 * If estimator parameters are specified, they must match the
 		 * existing estimator.
@@ -119,7 +131,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 		if ((!info->interval && !info->ewma_log) ||
 		    (info->interval != est->params.interval ||
 		     info->ewma_log != est->params.ewma_log)) {
-			xt_rateest_put(est);
+			xt_rateest_put(par->net, est);
 			return -EINVAL;
 		}
 		info->est = est;
@@ -148,14 +160,14 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
 		goto err2;
 
 	info->est = est;
-	xt_rateest_hash_insert(est);
-	mutex_unlock(&xt_rateest_mutex);
+	xt_rateest_hash_insert(xn, est);
+	mutex_unlock(&xn->hash_lock);
 	return 0;
 
 err2:
 	kfree(est);
 err1:
-	mutex_unlock(&xt_rateest_mutex);
+	mutex_unlock(&xn->hash_lock);
 	return ret;
 }
 
@@ -163,7 +175,7 @@ static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par)
 {
 	struct xt_rateest_target_info *info = par->targinfo;
 
-	xt_rateest_put(info->est);
+	xt_rateest_put(par->net, info->est);
 }
 
 static struct xt_target xt_rateest_tg_reg __read_mostly = {
@@ -178,19 +190,46 @@ static struct xt_target xt_rateest_tg_reg __read_mostly = {
 	.me         = THIS_MODULE,
 };
 
-static int __init xt_rateest_tg_init(void)
+static __net_init int xt_rateest_net_init(struct net *net)
+{
+	struct xt_rateest_net *xn = net_generic(net, xt_rateest_id);
+	int i;
+
+	mutex_init(&xn->hash_lock);
+	for (i = 0; i < ARRAY_SIZE(xn->hash); i++)
+		INIT_HLIST_HEAD(&xn->hash[i]);
+	return 0;
+}
+
+static void __net_exit xt_rateest_net_exit(struct net *net)
 {
-	unsigned int i;
+	struct xt_rateest_net *xn = net_generic(net, xt_rateest_id);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(xn->hash); i++)
+		WARN_ON_ONCE(!hlist_empty(&xn->hash[i]));
+}
 
-	for (i = 0; i < ARRAY_SIZE(rateest_hash); i++)
-		INIT_HLIST_HEAD(&rateest_hash[i]);
+static struct pernet_operations xt_rateest_net_ops = {
+	.init = xt_rateest_net_init,
+	.exit = xt_rateest_net_exit,
+	.id   = &xt_rateest_id,
+	.size = sizeof(struct xt_rateest_net),
+};
+
+static int __init xt_rateest_tg_init(void)
+{
+	int err = register_pernet_subsys(&xt_rateest_net_ops);
 
+	if (err)
+		return err;
 	return xt_register_target(&xt_rateest_tg_reg);
 }
 
 static void __exit xt_rateest_tg_fini(void)
 {
 	xt_unregister_target(&xt_rateest_tg_reg);
+	unregister_pernet_subsys(&xt_rateest_net_ops);
 }
 
 
diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c
index 755d2f6693a2..bf77326861af 100644
--- a/net/netfilter/xt_rateest.c
+++ b/net/netfilter/xt_rateest.c
@@ -95,13 +95,13 @@ static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par)
 	}
 
 	ret  = -ENOENT;
-	est1 = xt_rateest_lookup(info->name1);
+	est1 = xt_rateest_lookup(par->net, info->name1);
 	if (!est1)
 		goto err1;
 
 	est2 = NULL;
 	if (info->flags & XT_RATEEST_MATCH_REL) {
-		est2 = xt_rateest_lookup(info->name2);
+		est2 = xt_rateest_lookup(par->net, info->name2);
 		if (!est2)
 			goto err2;
 	}
@@ -111,7 +111,7 @@ static int xt_rateest_mt_checkentry(const struct xt_mtchk_param *par)
 	return 0;
 
 err2:
-	xt_rateest_put(est1);
+	xt_rateest_put(par->net, est1);
 err1:
 	return ret;
 }
@@ -120,9 +120,9 @@ static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par)
 {
 	struct xt_rateest_match_info *info = par->matchinfo;
 
-	xt_rateest_put(info->est1);
+	xt_rateest_put(par->net, info->est1);
 	if (info->est2)
-		xt_rateest_put(info->est2);
+		xt_rateest_put(par->net, info->est2);
 }
 
 static struct xt_match xt_rateest_mt_reg __read_mostly = {
-- 
cgit v1.2.3


From 010eacd968a73ddcb8592b14c1607e1004120ede Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Fri, 2 Mar 2018 14:59:54 +0100
Subject: netfilter: xt_limit: Spelling s/maxmum/maximum/

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_limit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index 55d18cd67635..9f098ecb2449 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -46,7 +46,7 @@ MODULE_ALIAS("ip6t_limit");
 
    See Alexey's formal explanation in net/sched/sch_tbf.c.
 
-   To get the maxmum range, we multiply by this factor (ie. you get N
+   To get the maximum range, we multiply by this factor (ie. you get N
    credits per jiffy).  We want to allow a rate as low as 1 per day
    (slowest userspace tool allows), which means
    CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */
-- 
cgit v1.2.3


From 415a13296a1a49639cabf9d2fe92267810caa47a Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Mon, 5 Mar 2018 15:49:59 -0600
Subject: xfrm_policy: use true and false for boolean values

Assign true or false to boolean variables instead of an integer value.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 12bd415d349e..2b7babb66175 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1740,7 +1740,7 @@ static void xfrm_pcpu_work_fn(struct work_struct *work)
 void xfrm_policy_cache_flush(void)
 {
 	struct xfrm_dst *old;
-	bool found = 0;
+	bool found = false;
 	int cpu;
 
 	might_sleep();
-- 
cgit v1.2.3


From 30855ffc29b972d3316e2adcf8029c00c36fad60 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 1 Mar 2018 15:23:28 +0300
Subject: net: Make account struct net to memcg

The patch adds SLAB_ACCOUNT to flags of net_cachep cache,
which enables accounting of struct net memory to memcg kmem.
Since number of net_namespaces may be significant, user
want to know, how much there were consumed, and control.

Note, that we do not account net_generic to the same memcg,
where net was accounted, moreover, we don't do this at all (*).
We do not want the situation, when single memcg memory deficit
prevents us to register new pernet_operations.

(*)Even despite there is !current process accounting already
available in linux-next. See kmalloc_memcg() there for the details.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 690e78c6af45..c340d5cfbdec 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -882,7 +882,7 @@ static int __init net_ns_init(void)
 #ifdef CONFIG_NET_NS
 	net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
 					SMP_CACHE_BYTES,
-					SLAB_PANIC, NULL);
+					SLAB_PANIC|SLAB_ACCOUNT, NULL);
 
 	/* Create workqueue for cleanup */
 	netns_wq = create_singlethread_workqueue("netns");
-- 
cgit v1.2.3


From ed63afb8a318f6b3558d76afba7809daee4f28e5 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 5 Mar 2018 20:44:18 +0800
Subject: sctp: add support for PR-SCTP Information for sendmsg

This patch is to add support for PR-SCTP Information for sendmsg,
as described in section 5.3.7 of RFC6458.

With this option, you can specify pr_policy and pr_value for user
data in sendmsg.

It's also a necessary send info for sctp_sendv.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 7fa76031bb08..fdde697b37e7 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1644,6 +1644,12 @@ static int sctp_sendmsg_parse(struct sock *sk, struct sctp_cmsgs *cmsgs,
 		srinfo->sinfo_assoc_id = cmsgs->sinfo->snd_assoc_id;
 	}
 
+	if (cmsgs->prinfo) {
+		srinfo->sinfo_timetolive = cmsgs->prinfo->pr_value;
+		SCTP_PR_SET_POLICY(srinfo->sinfo_flags,
+				   cmsgs->prinfo->pr_policy);
+	}
+
 	sflags = srinfo->sinfo_flags;
 	if (!sflags && msg_len)
 		return 0;
@@ -1901,9 +1907,12 @@ static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc,
 		sinfo->sinfo_ppid = asoc->default_ppid;
 		sinfo->sinfo_context = asoc->default_context;
 		sinfo->sinfo_assoc_id = sctp_assoc2id(asoc);
+
+		if (!cmsgs->prinfo)
+			sinfo->sinfo_flags = asoc->default_flags;
 	}
 
-	if (!cmsgs->srinfo)
+	if (!cmsgs->srinfo && !cmsgs->prinfo)
 		sinfo->sinfo_timetolive = asoc->default_timetolive;
 }
 
@@ -7749,6 +7758,26 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs)
 			      SCTP_ABORT | SCTP_EOF))
 				return -EINVAL;
 			break;
+		case SCTP_PRINFO:
+			/* SCTP Socket API Extension
+			 * 5.3.7 SCTP PR-SCTP Information Structure (SCTP_PRINFO)
+			 *
+			 * This cmsghdr structure specifies SCTP options for sendmsg().
+			 *
+			 * cmsg_level    cmsg_type      cmsg_data[]
+			 * ------------  ------------   ---------------------
+			 * IPPROTO_SCTP  SCTP_PRINFO    struct sctp_prinfo
+			 */
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_prinfo)))
+				return -EINVAL;
+
+			cmsgs->prinfo = CMSG_DATA(cmsg);
+			if (cmsgs->prinfo->pr_policy & ~SCTP_PR_SCTP_MASK)
+				return -EINVAL;
+
+			if (cmsgs->prinfo->pr_policy == SCTP_PR_SCTP_NONE)
+				cmsgs->prinfo->pr_value = 0;
+			break;
 		default:
 			return -EINVAL;
 		}
-- 
cgit v1.2.3


From 2c0dbaa0c43d04d8d6daf52adb724c5789676b15 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 5 Mar 2018 20:44:19 +0800
Subject: sctp: add support for SCTP_DSTADDRV4/6 Information for sendmsg

This patch is to add support for Destination IPv4/6 Address options
for sendmsg, as described in section 5.3.9/10 of RFC6458.

With this option, you can provide more than one destination addrs
to sendmsg when creating asoc, like sctp_connectx.

It's also a necessary send info for sctp_sendv.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index fdde697b37e7..067b57a330b1 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1676,6 +1676,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
 	struct net *net = sock_net(sk);
 	struct sctp_association *asoc;
 	enum sctp_scope scope;
+	struct cmsghdr *cmsg;
 	int err = -EINVAL;
 
 	*tp = NULL;
@@ -1741,6 +1742,67 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
 		goto free;
 	}
 
+	if (!cmsgs->addrs_msg)
+		return 0;
+
+	/* sendv addr list parse */
+	for_each_cmsghdr(cmsg, cmsgs->addrs_msg) {
+		struct sctp_transport *transport;
+		struct sctp_association *old;
+		union sctp_addr _daddr;
+		int dlen;
+
+		if (cmsg->cmsg_level != IPPROTO_SCTP ||
+		    (cmsg->cmsg_type != SCTP_DSTADDRV4 &&
+		     cmsg->cmsg_type != SCTP_DSTADDRV6))
+			continue;
+
+		daddr = &_daddr;
+		memset(daddr, 0, sizeof(*daddr));
+		dlen = cmsg->cmsg_len - sizeof(struct cmsghdr);
+		if (cmsg->cmsg_type == SCTP_DSTADDRV4) {
+			if (dlen < sizeof(struct in_addr))
+				goto free;
+
+			dlen = sizeof(struct in_addr);
+			daddr->v4.sin_family = AF_INET;
+			daddr->v4.sin_port = htons(asoc->peer.port);
+			memcpy(&daddr->v4.sin_addr, CMSG_DATA(cmsg), dlen);
+		} else {
+			if (dlen < sizeof(struct in6_addr))
+				goto free;
+
+			dlen = sizeof(struct in6_addr);
+			daddr->v6.sin6_family = AF_INET6;
+			daddr->v6.sin6_port = htons(asoc->peer.port);
+			memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen);
+		}
+		err = sctp_verify_addr(sk, daddr, sizeof(*daddr));
+		if (err)
+			goto free;
+
+		old = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
+		if (old && old != asoc) {
+			if (old->state >= SCTP_STATE_ESTABLISHED)
+				err = -EISCONN;
+			else
+				err = -EALREADY;
+			goto free;
+		}
+
+		if (sctp_endpoint_is_peeled_off(ep, daddr)) {
+			err = -EADDRNOTAVAIL;
+			goto free;
+		}
+
+		transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL,
+						SCTP_UNKNOWN);
+		if (!transport) {
+			err = -ENOMEM;
+			goto free;
+		}
+	}
+
 	return 0;
 
 free:
@@ -7778,6 +7840,21 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs)
 			if (cmsgs->prinfo->pr_policy == SCTP_PR_SCTP_NONE)
 				cmsgs->prinfo->pr_value = 0;
 			break;
+		case SCTP_DSTADDRV4:
+		case SCTP_DSTADDRV6:
+			/* SCTP Socket API Extension
+			 * 5.3.9/10 SCTP Destination IPv4/6 Address Structure (SCTP_DSTADDRV4/6)
+			 *
+			 * This cmsghdr structure specifies SCTP options for sendmsg().
+			 *
+			 * cmsg_level    cmsg_type         cmsg_data[]
+			 * ------------  ------------   ---------------------
+			 * IPPROTO_SCTP  SCTP_DSTADDRV4 struct in_addr
+			 * ------------  ------------   ---------------------
+			 * IPPROTO_SCTP  SCTP_DSTADDRV6 struct in6_addr
+			 */
+			cmsgs->addrs_msg = my_msg;
+			break;
 		default:
 			return -EINVAL;
 		}
-- 
cgit v1.2.3


From 4910280503f3af2857d5aa77e35b22d93a8960a8 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 5 Mar 2018 20:44:20 +0800
Subject: sctp: add support for snd flag SCTP_SENDALL process in sendmsg

This patch is to add support for snd flag SCTP_SENDALL process
in sendmsg, as described in section 5.3.4 of RFC6458.

With this flag, you can send the same data to all the asocs of
this sk once.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 067b57a330b1..7d3476a4860d 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1820,6 +1820,10 @@ static int sctp_sendmsg_check_sflags(struct sctp_association *asoc,
 	if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP))
 		return -EPIPE;
 
+	if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP) &&
+	    !sctp_state(asoc, ESTABLISHED))
+		return 0;
+
 	if (sflags & SCTP_EOF) {
 		pr_debug("%s: shutting down association:%p\n", __func__, asoc);
 		sctp_primitive_SHUTDOWN(net, asoc, NULL);
@@ -2007,6 +2011,29 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
 
 	lock_sock(sk);
 
+	/* SCTP_SENDALL process */
+	if ((sflags & SCTP_SENDALL) && sctp_style(sk, UDP)) {
+		list_for_each_entry(asoc, &ep->asocs, asocs) {
+			err = sctp_sendmsg_check_sflags(asoc, sflags, msg,
+							msg_len);
+			if (err == 0)
+				continue;
+			if (err < 0)
+				goto out_unlock;
+
+			sctp_sendmsg_update_sinfo(asoc, sinfo, &cmsgs);
+
+			err = sctp_sendmsg_to_asoc(asoc, msg, msg_len,
+						   NULL, sinfo);
+			if (err < 0)
+				goto out_unlock;
+
+			iov_iter_revert(&msg->msg_iter, err);
+		}
+
+		goto out_unlock;
+	}
+
 	/* Get and check or create asoc */
 	if (daddr) {
 		asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
@@ -7792,8 +7819,8 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs)
 
 			if (cmsgs->srinfo->sinfo_flags &
 			    ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
-			      SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK |
-			      SCTP_ABORT | SCTP_EOF))
+			      SCTP_SACK_IMMEDIATELY | SCTP_SENDALL |
+			      SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF))
 				return -EINVAL;
 			break;
 
@@ -7816,8 +7843,8 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs)
 
 			if (cmsgs->sinfo->snd_flags &
 			    ~(SCTP_UNORDERED | SCTP_ADDR_OVER |
-			      SCTP_SACK_IMMEDIATELY | SCTP_PR_SCTP_MASK |
-			      SCTP_ABORT | SCTP_EOF))
+			      SCTP_SACK_IMMEDIATELY | SCTP_SENDALL |
+			      SCTP_PR_SCTP_MASK | SCTP_ABORT | SCTP_EOF))
 				return -EINVAL;
 			break;
 		case SCTP_PRINFO:
-- 
cgit v1.2.3


From c9efb15a9981400c797665dc4844ab562ed3c424 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Mon, 5 Mar 2018 15:56:14 -0600
Subject: tipc: bcast: use true and false for boolean values

Assign true or false to boolean variables instead of an integer value.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bcast.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 37892b3909af..f3711176be45 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -574,5 +574,5 @@ void tipc_nlist_purge(struct tipc_nlist *nl)
 {
 	tipc_dest_list_purge(&nl->list);
 	nl->remote = 0;
-	nl->local = 0;
+	nl->local = false;
 }
-- 
cgit v1.2.3


From 9a21ac943240d0353b726495d08e377a257ace52 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Mon, 5 Mar 2018 16:11:54 -0600
Subject: ipv6: ndisc: use true and false for boolean values

Assign true or false to boolean variables instead of an integer value.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ndisc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0a19ce3a6f7f..8af5eef464c1 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -527,7 +527,7 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
 	}
 
 	if (!dev->addr_len)
-		inc_opt = 0;
+		inc_opt = false;
 	if (inc_opt)
 		optlen += ndisc_opt_addr_space(dev,
 					       NDISC_NEIGHBOUR_ADVERTISEMENT);
-- 
cgit v1.2.3


From 334e6413134bf8335ec93a9e60213cbc61ef7a62 Mon Sep 17 00:00:00 2001
From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Date: Wed, 7 Mar 2018 09:40:57 -0800
Subject: sock: Fix SO_ZEROCOPY switch case

Fix the SO_ZEROCOPY switch case on sock_setsockopt() avoiding the
ret values to be overwritten by the one set on the default case.

Fixes: 28190752c7092 ("sock: permit SO_ZEROCOPY on PF_RDS socket")
Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 507d8c6c4319..27f218bba43f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1062,8 +1062,9 @@ set_rcvbuf:
 				ret = -EINVAL;
 			else
 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
-			break;
 		}
+		break;
+
 	default:
 		ret = -ENOPROTOOPT;
 		break;
-- 
cgit v1.2.3


From d40a126b16ea851f858f97b0e214d97c8355cecb Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Tue, 6 Mar 2018 07:22:33 -0800
Subject: rds: refactor zcopy code into rds_message_zcopy_from_user

Move the large block of code predicated on zcopy from
rds_message_copy_from_user into a new function,
rds_message_zcopy_from_user()

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/message.c | 108 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 60 insertions(+), 48 deletions(-)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index 116cf87ccb89..c36edbb92c34 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -333,14 +333,14 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
 	return rm;
 }
 
-int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
-			       bool zcopy)
+int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from)
 {
-	unsigned long to_copy, nbytes;
 	unsigned long sg_off;
 	struct scatterlist *sg;
 	int ret = 0;
 	int length = iov_iter_count(from);
+	int total_copied = 0;
+	struct sk_buff *skb;
 
 	rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
 
@@ -350,54 +350,66 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
 	sg = rm->data.op_sg;
 	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
 
-	if (zcopy) {
-		int total_copied = 0;
-		struct sk_buff *skb;
-
-		skb = alloc_skb(0, GFP_KERNEL);
-		if (!skb)
-			return -ENOMEM;
-		BUILD_BUG_ON(sizeof(skb->cb) <
-			     max_t(int, sizeof(struct rds_znotifier),
-				   sizeof(struct rds_zcopy_cookies)));
-		rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb);
-		if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
-					    length)) {
-			ret = -ENOMEM;
+	skb = alloc_skb(0, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+	BUILD_BUG_ON(sizeof(skb->cb) < max_t(int, sizeof(struct rds_znotifier),
+					     sizeof(struct rds_zcopy_cookies)));
+	rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb);
+	if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
+				    length)) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	while (iov_iter_count(from)) {
+		struct page *pages;
+		size_t start;
+		ssize_t copied;
+
+		copied = iov_iter_get_pages(from, &pages, PAGE_SIZE,
+					    1, &start);
+		if (copied < 0) {
+			struct mmpin *mmp;
+			int i;
+
+			for (i = 0; i < rm->data.op_nents; i++)
+				put_page(sg_page(&rm->data.op_sg[i]));
+			mmp = &rm->data.op_mmp_znotifier->z_mmp;
+			mm_unaccount_pinned_pages(mmp);
+			ret = -EFAULT;
 			goto err;
 		}
-		while (iov_iter_count(from)) {
-			struct page *pages;
-			size_t start;
-			ssize_t copied;
-
-			copied = iov_iter_get_pages(from, &pages, PAGE_SIZE,
-						    1, &start);
-			if (copied < 0) {
-				struct mmpin *mmp;
-				int i;
-
-				for (i = 0; i < rm->data.op_nents; i++)
-					put_page(sg_page(&rm->data.op_sg[i]));
-				mmp = &rm->data.op_mmp_znotifier->z_mmp;
-				mm_unaccount_pinned_pages(mmp);
-				ret = -EFAULT;
-				goto err;
-			}
-			total_copied += copied;
-			iov_iter_advance(from, copied);
-			length -= copied;
-			sg_set_page(sg, pages, copied, start);
-			rm->data.op_nents++;
-			sg++;
-		}
-		WARN_ON_ONCE(length != 0);
-		return ret;
+		total_copied += copied;
+		iov_iter_advance(from, copied);
+		length -= copied;
+		sg_set_page(sg, pages, copied, start);
+		rm->data.op_nents++;
+		sg++;
+	}
+	WARN_ON_ONCE(length != 0);
+	return ret;
 err:
-		consume_skb(skb);
-		rm->data.op_mmp_znotifier = NULL;
-		return ret;
-	} /* zcopy */
+	consume_skb(skb);
+	rm->data.op_mmp_znotifier = NULL;
+	return ret;
+}
+
+int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
+			       bool zcopy)
+{
+	unsigned long to_copy, nbytes;
+	unsigned long sg_off;
+	struct scatterlist *sg;
+	int ret = 0;
+
+	rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
+
+	/* now allocate and copy in the data payload.  */
+	sg = rm->data.op_sg;
+	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
+
+	if (zcopy)
+		return rds_message_zcopy_from_user(rm, from);
 
 	while (iov_iter_count(from)) {
 		if (!sg_page(sg)) {
-- 
cgit v1.2.3


From 9426bbc6de99b8649d897b94e8f5916b58195643 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Tue, 6 Mar 2018 07:22:34 -0800
Subject: rds: use list structure to track information for zerocopy completion
 notification

Commit 401910db4cd4 ("rds: deliver zerocopy completion notification
with data") removes support fo r zerocopy completion notification
on the sk_error_queue, thus we no longer need to track the cookie
information in sk_buff structures.

This commit removes the struct sk_buff_head rs_zcookie_queue by
a simpler list that results in a smaller memory footprint as well
as more efficient memory_allocation time.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/af_rds.c  |  6 ++---
 net/rds/message.c | 77 +++++++++++++++++++++++++++++++++++--------------------
 net/rds/rds.h     | 23 ++++++++++++-----
 net/rds/recv.c    | 23 ++++++++++++-----
 4 files changed, 85 insertions(+), 44 deletions(-)

(limited to 'net')

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index f7126108a811..ab751a150f70 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -77,7 +77,7 @@ static int rds_release(struct socket *sock)
 	rds_send_drop_to(rs, NULL);
 	rds_rdma_drop_keys(rs);
 	rds_notify_queue_get(rs, NULL);
-	__skb_queue_purge(&rs->rs_zcookie_queue);
+	rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue);
 
 	spin_lock_bh(&rds_sock_lock);
 	list_del_init(&rs->rs_item);
@@ -180,7 +180,7 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
 	}
 	if (!list_empty(&rs->rs_recv_queue) ||
 	    !list_empty(&rs->rs_notify_queue) ||
-	    !skb_queue_empty(&rs->rs_zcookie_queue))
+	    !list_empty(&rs->rs_zcookie_queue.zcookie_head))
 		mask |= (EPOLLIN | EPOLLRDNORM);
 	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
 		mask |= (EPOLLOUT | EPOLLWRNORM);
@@ -515,7 +515,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
 	INIT_LIST_HEAD(&rs->rs_recv_queue);
 	INIT_LIST_HEAD(&rs->rs_notify_queue);
 	INIT_LIST_HEAD(&rs->rs_cong_list);
-	skb_queue_head_init(&rs->rs_zcookie_queue);
+	rds_message_zcopy_queue_init(&rs->rs_zcookie_queue);
 	spin_lock_init(&rs->rs_rdma_lock);
 	rs->rs_rdma_keys = RB_ROOT;
 	rs->rs_rx_traces = 0;
diff --git a/net/rds/message.c b/net/rds/message.c
index c36edbb92c34..90dcdcfe9f62 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -48,7 +48,6 @@ static unsigned int	rds_exthdr_size[__RDS_EXTHDR_MAX] = {
 [RDS_EXTHDR_GEN_NUM]	= sizeof(u32),
 };
 
-
 void rds_message_addref(struct rds_message *rm)
 {
 	rdsdebug("addref rm %p ref %d\n", rm, refcount_read(&rm->m_refcount));
@@ -56,9 +55,9 @@ void rds_message_addref(struct rds_message *rm)
 }
 EXPORT_SYMBOL_GPL(rds_message_addref);
 
-static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie)
+static inline bool rds_zcookie_add(struct rds_msg_zcopy_info *info, u32 cookie)
 {
-	struct rds_zcopy_cookies *ck = (struct rds_zcopy_cookies *)skb->cb;
+	struct rds_zcopy_cookies *ck = &info->zcookies;
 	int ncookies = ck->num;
 
 	if (ncookies == RDS_MAX_ZCOOKIES)
@@ -68,38 +67,61 @@ static inline bool skb_zcookie_add(struct sk_buff *skb, u32 cookie)
 	return true;
 }
 
+struct rds_msg_zcopy_info *rds_info_from_znotifier(struct rds_znotifier *znotif)
+{
+	return container_of(znotif, struct rds_msg_zcopy_info, znotif);
+}
+
+void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *q)
+{
+	unsigned long flags;
+	LIST_HEAD(copy);
+	struct rds_msg_zcopy_info *info, *tmp;
+
+	spin_lock_irqsave(&q->lock, flags);
+	list_splice(&q->zcookie_head, &copy);
+	INIT_LIST_HEAD(&q->zcookie_head);
+	spin_unlock_irqrestore(&q->lock, flags);
+
+	list_for_each_entry_safe(info, tmp, &copy, rs_zcookie_next) {
+		list_del(&info->rs_zcookie_next);
+		kfree(info);
+	}
+}
+
 static void rds_rm_zerocopy_callback(struct rds_sock *rs,
 				     struct rds_znotifier *znotif)
 {
-	struct sk_buff *skb, *tail;
-	unsigned long flags;
-	struct sk_buff_head *q;
+	struct rds_msg_zcopy_info *info;
+	struct rds_msg_zcopy_queue *q;
 	u32 cookie = znotif->z_cookie;
 	struct rds_zcopy_cookies *ck;
+	struct list_head *head;
+	unsigned long flags;
 
+	mm_unaccount_pinned_pages(&znotif->z_mmp);
 	q = &rs->rs_zcookie_queue;
 	spin_lock_irqsave(&q->lock, flags);
-	tail = skb_peek_tail(q);
-
-	if (tail && skb_zcookie_add(tail, cookie)) {
-		spin_unlock_irqrestore(&q->lock, flags);
-		mm_unaccount_pinned_pages(&znotif->z_mmp);
-		consume_skb(rds_skb_from_znotifier(znotif));
-		/* caller invokes rds_wake_sk_sleep() */
-		return;
+	head = &q->zcookie_head;
+	if (!list_empty(head)) {
+		info = list_entry(head, struct rds_msg_zcopy_info,
+				  rs_zcookie_next);
+		if (info && rds_zcookie_add(info, cookie)) {
+			spin_unlock_irqrestore(&q->lock, flags);
+			kfree(rds_info_from_znotifier(znotif));
+			/* caller invokes rds_wake_sk_sleep() */
+			return;
+		}
 	}
 
-	skb = rds_skb_from_znotifier(znotif);
-	ck = (struct rds_zcopy_cookies *)skb->cb;
+	info = rds_info_from_znotifier(znotif);
+	ck = &info->zcookies;
 	memset(ck, 0, sizeof(*ck));
-	WARN_ON(!skb_zcookie_add(skb, cookie));
-
-	__skb_queue_tail(q, skb);
+	WARN_ON(!rds_zcookie_add(info, cookie));
+	list_add_tail(&q->zcookie_head, &info->rs_zcookie_next);
 
 	spin_unlock_irqrestore(&q->lock, flags);
 	/* caller invokes rds_wake_sk_sleep() */
-
-	mm_unaccount_pinned_pages(&znotif->z_mmp);
 }
 
 /*
@@ -340,7 +362,7 @@ int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from)
 	int ret = 0;
 	int length = iov_iter_count(from);
 	int total_copied = 0;
-	struct sk_buff *skb;
+	struct rds_msg_zcopy_info *info;
 
 	rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from));
 
@@ -350,12 +372,11 @@ int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from)
 	sg = rm->data.op_sg;
 	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
 
-	skb = alloc_skb(0, GFP_KERNEL);
-	if (!skb)
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
 		return -ENOMEM;
-	BUILD_BUG_ON(sizeof(skb->cb) < max_t(int, sizeof(struct rds_znotifier),
-					     sizeof(struct rds_zcopy_cookies)));
-	rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb);
+	INIT_LIST_HEAD(&info->rs_zcookie_next);
+	rm->data.op_mmp_znotifier = &info->znotif;
 	if (mm_account_pinned_pages(&rm->data.op_mmp_znotifier->z_mmp,
 				    length)) {
 		ret = -ENOMEM;
@@ -389,7 +410,7 @@ int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from)
 	WARN_ON_ONCE(length != 0);
 	return ret;
 err:
-	consume_skb(skb);
+	kfree(info);
 	rm->data.op_mmp_znotifier = NULL;
 	return ret;
 }
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 33b16353d8f3..74cd27c661de 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -357,16 +357,27 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
 #define RDS_MSG_FLUSH		8
 
 struct rds_znotifier {
-	struct list_head	z_list;
 	struct mmpin		z_mmp;
 	u32			z_cookie;
 };
 
-#define	RDS_ZCOPY_SKB(__skb)	((struct rds_znotifier *)&((__skb)->cb[0]))
+struct rds_msg_zcopy_info {
+	struct list_head rs_zcookie_next;
+	union {
+		struct rds_znotifier znotif;
+		struct rds_zcopy_cookies zcookies;
+	};
+};
 
-static inline struct sk_buff *rds_skb_from_znotifier(struct rds_znotifier *z)
+struct rds_msg_zcopy_queue {
+	struct list_head zcookie_head;
+	spinlock_t lock; /* protects zcookie_head queue */
+};
+
+static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q)
 {
-	return container_of((void *)z, struct sk_buff, cb);
+	spin_lock_init(&q->lock);
+	INIT_LIST_HEAD(&q->zcookie_head);
 }
 
 struct rds_message {
@@ -603,8 +614,7 @@ struct rds_sock {
 	/* Socket receive path trace points*/
 	u8			rs_rx_traces;
 	u8			rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
-
-	struct sk_buff_head	rs_zcookie_queue;
+	struct rds_msg_zcopy_queue rs_zcookie_queue;
 };
 
 static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
@@ -803,6 +813,7 @@ void rds_message_addref(struct rds_message *rm);
 void rds_message_put(struct rds_message *rm);
 void rds_message_wait(struct rds_message *rm);
 void rds_message_unmapped(struct rds_message *rm);
+void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *info);
 
 static inline void rds_message_make_checksum(struct rds_header *hdr)
 {
diff --git a/net/rds/recv.c b/net/rds/recv.c
index d50747725221..de50e2126e40 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -579,9 +579,10 @@ out:
 
 static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg)
 {
-	struct sk_buff *skb;
-	struct sk_buff_head *q = &rs->rs_zcookie_queue;
+	struct rds_msg_zcopy_queue *q = &rs->rs_zcookie_queue;
+	struct rds_msg_zcopy_info *info = NULL;
 	struct rds_zcopy_cookies *done;
+	unsigned long flags;
 
 	if (!msg->msg_control)
 		return false;
@@ -590,16 +591,24 @@ static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg)
 	    msg->msg_controllen < CMSG_SPACE(sizeof(*done)))
 		return false;
 
-	skb = skb_dequeue(q);
-	if (!skb)
+	spin_lock_irqsave(&q->lock, flags);
+	if (!list_empty(&q->zcookie_head)) {
+		info = list_entry(q->zcookie_head.next,
+				  struct rds_msg_zcopy_info, rs_zcookie_next);
+		list_del(&info->rs_zcookie_next);
+	}
+	spin_unlock_irqrestore(&q->lock, flags);
+	if (!info)
 		return false;
-	done = (struct rds_zcopy_cookies *)skb->cb;
+	done = &info->zcookies;
 	if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done),
 		     done)) {
-		skb_queue_head(q, skb);
+		spin_lock_irqsave(&q->lock, flags);
+		list_add(&info->rs_zcookie_next, &q->zcookie_head);
+		spin_unlock_irqrestore(&q->lock, flags);
 		return false;
 	}
-	consume_skb(skb);
+	kfree(info);
 	return true;
 }
 
-- 
cgit v1.2.3


From a366e300ae9fc466d333e6d8f2bc5d58ed248041 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 7 Mar 2018 08:43:19 -0800
Subject: ip6mr: remove synchronize_rcu() in favor of SOCK_RCU_FREE

Kirill found that recently added synchronize_rcu() call in
ip6mr_sk_done()
was slowing down netns dismantle and posted a patch to use it only if
the socket
was found.

I instead suggested to get rid of this call, and use instead
SOCK_RCU_FREE

We might later change IPv4 side to use the same technique and unify
both stacks. IPv4 does not use synchronize_rcu() but has a call_rcu()
that could be replaced by SOCK_RCU_FREE.

Tested:
 time for i in {1..1000}; do unshare -n /bin/false;done

 Before : real 7m18.911s
 After : real 10.187s

Fixes: 8571ab479a6e ("ip6mr: Make mroute_sk rcu-based")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Yuval Mintz <yuvalm@mellanox.com>
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 2a38f9de45d3..7345bd6c4b7d 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1443,6 +1443,7 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
 		err = -EADDRINUSE;
 	} else {
 		rcu_assign_pointer(mrt->mroute_sk, sk);
+		sock_set_flag(sk, SOCK_RCU_FREE);
 		net->ipv6.devconf_all->mc_forwarding++;
 	}
 	write_unlock_bh(&mrt_lock);
@@ -1472,6 +1473,10 @@ int ip6mr_sk_done(struct sock *sk)
 		if (sk == rtnl_dereference(mrt->mroute_sk)) {
 			write_lock_bh(&mrt_lock);
 			RCU_INIT_POINTER(mrt->mroute_sk, NULL);
+			/* Note that mroute_sk had SOCK_RCU_FREE set,
+			 * so the RCU grace period before sk freeing
+			 * is guaranteed by sk_destruct()
+			 */
 			net->ipv6.devconf_all->mc_forwarding--;
 			write_unlock_bh(&mrt_lock);
 			inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
@@ -1485,7 +1490,6 @@ int ip6mr_sk_done(struct sock *sk)
 		}
 	}
 	rtnl_unlock();
-	synchronize_rcu();
 
 	return err;
 }
-- 
cgit v1.2.3


From 67ae686b3e1422a819500f35152cdd74f6dab6ce Mon Sep 17 00:00:00 2001
From: Arkadi Sharshevsky <arkadis@mellanox.com>
Date: Thu, 8 Mar 2018 12:52:25 +0200
Subject: devlink: Change dpipe/resource get privileges

Let dpipe/resource be retrieved by unprivileged users.

Signed-off-by: Arkadi Sharshevsky <arkadis@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index 1b5bf0d1cee9..f23e5ed7c90f 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -2744,22 +2744,22 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
 		.doit = devlink_nl_cmd_dpipe_table_get,
 		.policy = devlink_nl_policy,
-		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		/* can be retrieved by unprivileged users */
 	},
 	{
 		.cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET,
 		.doit = devlink_nl_cmd_dpipe_entries_get,
 		.policy = devlink_nl_policy,
-		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		/* can be retrieved by unprivileged users */
 	},
 	{
 		.cmd = DEVLINK_CMD_DPIPE_HEADERS_GET,
 		.doit = devlink_nl_cmd_dpipe_headers_get,
 		.policy = devlink_nl_policy,
-		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		/* can be retrieved by unprivileged users */
 	},
 	{
 		.cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
@@ -2779,8 +2779,8 @@ static const struct genl_ops devlink_nl_ops[] = {
 		.cmd = DEVLINK_CMD_RESOURCE_DUMP,
 		.doit = devlink_nl_cmd_resource_dump,
 		.policy = devlink_nl_policy,
-		.flags = GENL_ADMIN_PERM,
 		.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+		/* can be retrieved by unprivileged users */
 	},
 	{
 		.cmd = DEVLINK_CMD_RELOAD,
-- 
cgit v1.2.3


From 459d153d9916ea48b1550bbb6f2959dc03bff011 Mon Sep 17 00:00:00 2001
From: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Date: Tue, 6 Mar 2018 18:11:14 +0100
Subject: net/sched: cls_flower: Add support to handle first frag as match
 field

Allow setting firstfrag as matching option in tc flower classifier.

 # tc filter add dev eth0 protocol ip parent ffff: \
     flower indev eth0 \
        ip_flags firstfrag
     action mirred egress redirect dev eth1

Signed-off-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Signed-off-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_flower.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 7d0ce2c40f93..d964e60c730e 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -511,6 +511,9 @@ static int fl_set_key_flags(struct nlattr **tb,
 
 	fl_set_key_flag(key, mask, flags_key, flags_mask,
 			TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT);
+	fl_set_key_flag(key, mask, flags_key, flags_mask,
+			TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST,
+			FLOW_DIS_FIRST_FRAG);
 
 	return 0;
 }
@@ -1130,6 +1133,9 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
 
 	fl_get_key_flag(flags_key, flags_mask, &key, &mask,
 			TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT);
+	fl_get_key_flag(flags_key, flags_mask, &key, &mask,
+			TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST,
+			FLOW_DIS_FIRST_FRAG);
 
 	_key = cpu_to_be32(key);
 	_mask = cpu_to_be32(mask);
-- 
cgit v1.2.3


From 997266a4a02de882de11c7abecad0a3a42ac84b3 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:39:06 +0300
Subject: net: Convert ip6 tables pernet_operations

The pernet_operations:

    ip6table_filter_net_ops
    ip6table_mangle_net_ops
    ip6table_nat_net_ops
    ip6table_raw_net_ops
    ip6table_security_net_ops

have exit methods, which call ip6t_unregister_table().
ip6table_filter_net_ops has init method registering
filter table.

Since there must not be in-flight ipv6 packets at the time
of pernet_operations execution and since pernet_operations
don't send ipv6 packets each other, these pernet_operations
are safe to be async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/netfilter/ip6table_filter.c   | 1 +
 net/ipv6/netfilter/ip6table_mangle.c   | 1 +
 net/ipv6/netfilter/ip6table_nat.c      | 1 +
 net/ipv6/netfilter/ip6table_raw.c      | 1 +
 net/ipv6/netfilter/ip6table_security.c | 1 +
 5 files changed, 5 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 1343077dde93..06561c84c0bc 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -87,6 +87,7 @@ static void __net_exit ip6table_filter_net_exit(struct net *net)
 static struct pernet_operations ip6table_filter_net_ops = {
 	.init = ip6table_filter_net_init,
 	.exit = ip6table_filter_net_exit,
+	.async = true,
 };
 
 static int __init ip6table_filter_init(void)
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index b0524b18c4fb..a11e25936b45 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -107,6 +107,7 @@ static void __net_exit ip6table_mangle_net_exit(struct net *net)
 
 static struct pernet_operations ip6table_mangle_net_ops = {
 	.exit = ip6table_mangle_net_exit,
+	.async = true,
 };
 
 static int __init ip6table_mangle_init(void)
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 47306e45a80a..4475fd300bb6 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -131,6 +131,7 @@ static void __net_exit ip6table_nat_net_exit(struct net *net)
 
 static struct pernet_operations ip6table_nat_net_ops = {
 	.exit	= ip6table_nat_net_exit,
+	.async	= true,
 };
 
 static int __init ip6table_nat_init(void)
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 710fa0806c37..a88f3b1995b1 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -75,6 +75,7 @@ static void __net_exit ip6table_raw_net_exit(struct net *net)
 
 static struct pernet_operations ip6table_raw_net_ops = {
 	.exit = ip6table_raw_net_exit,
+	.async = true,
 };
 
 static int __init ip6table_raw_init(void)
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index cf26ccb04056..320048c008dc 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -74,6 +74,7 @@ static void __net_exit ip6table_security_net_exit(struct net *net)
 
 static struct pernet_operations ip6table_security_net_ops = {
 	.exit = ip6table_security_net_exit,
+	.async = true,
 };
 
 static int __init ip6table_security_init(void)
-- 
cgit v1.2.3


From 649b9826cc733fe410207d28d94984354e023b21 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:39:14 +0300
Subject: net: Convert xfrm_user_net_ops

These pernet_operations create and destroy net::xfrm::nlsk
socket of NETLINK_XFRM. There is only entry point, where
it's dereferenced, it's xfrm_user_rcv_msg(). There is no
in-kernel senders to this socket.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_user.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 7f52b8eb177d..aff2e84ec761 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -3258,6 +3258,7 @@ static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list)
 static struct pernet_operations xfrm_user_net_ops = {
 	.init	    = xfrm_user_net_init,
 	.exit_batch = xfrm_user_net_exit,
+	.async	    = true,
 };
 
 static int __init xfrm_user_init(void)
-- 
cgit v1.2.3


From c7c5e435e44e585f84472f88b323553c794844c4 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:39:23 +0300
Subject: net: Convert nf_tables_net_ops

These pernet_operations looks nicely separated per-net.
Exit method unregisters net's nf tables objects.
We allow them be executed in parallel.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_tables_api.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 558593e6a0a3..8e19c86d1aa6 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6596,6 +6596,7 @@ static void __net_exit nf_tables_exit_net(struct net *net)
 static struct pernet_operations nf_tables_net_ops = {
 	.init	= nf_tables_init_net,
 	.exit	= nf_tables_exit_net,
+	.async	= true,
 };
 
 static int __init nf_tables_module_init(void)
-- 
cgit v1.2.3


From 5a8e9be69d163fcd3442c4ed8fbaaaf0c7f464e6 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:39:33 +0300
Subject: net: Convert nfnetlink_net_ops

These pernet_operations create and destroy net::nfnl
socket of NETLINK_NETFILTER code. There are no other
places, where such type the socket is created, except
these pernet_operations. It seem other pernet_operations
depending on CONFIG_NETFILTER_NETLINK send messages
to this socket. So, we mark it async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nfnetlink.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 03ead8a9e90c..84fc4954862d 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -566,6 +566,7 @@ static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list)
 static struct pernet_operations nfnetlink_net_ops = {
 	.init		= nfnetlink_net_init,
 	.exit_batch	= nfnetlink_net_exit_batch,
+	.async		= true,
 };
 
 static int __init nfnetlink_init(void)
-- 
cgit v1.2.3


From cf51503a03f7ff89d8152a3132843330be90729e Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:39:42 +0300
Subject: net: Convert nfnl_acct_ops

These pernet_operations look closed in themself,
and there are no other users of net::nfnl_acct_list
outside. They are safe to be executed for several
net namespaces in parallel.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nfnetlink_acct.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 88d427f9f9e6..8d9f18bb8840 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -515,6 +515,7 @@ static void __net_exit nfnl_acct_net_exit(struct net *net)
 static struct pernet_operations nfnl_acct_ops = {
         .init   = nfnl_acct_net_init,
         .exit   = nfnl_acct_net_exit,
+	.async	= true,
 };
 
 static int __init nfnl_acct_init(void)
-- 
cgit v1.2.3


From ffdf72bc1eed9a56aa266bde2d425f85b4d4ca18 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:39:51 +0300
Subject: net: Convert cttimeout_ops

These pernet_operations also look closed in themself.
Exit method touch only per-net structures, so it's
safe to execute them for several net namespaces in parallel.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nfnetlink_cttimeout.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 95b04702a655..6819300f7fb7 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -586,6 +586,7 @@ static void __net_exit cttimeout_net_exit(struct net *net)
 static struct pernet_operations cttimeout_ops = {
 	.init	= cttimeout_net_init,
 	.exit	= cttimeout_net_exit,
+	.async	= true,
 };
 
 static int __init cttimeout_init(void)
-- 
cgit v1.2.3


From 74f26bbf505a8e1bb9a6ad9726d5bbe625607783 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:40:00 +0300
Subject: net: Convert nfnl_log_net_ops

These pernet_operations create and destroy /proc entries.
Also, exit method unsets nfulnl_logger. The logger is not
set by default, and it becomes bound via userspace request.
So, they look safe to be made async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nfnetlink_log.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 7b46aa4c478d..b21ef79849a1 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -1108,6 +1108,7 @@ static struct pernet_operations nfnl_log_net_ops = {
 	.exit	= nfnl_log_net_exit,
 	.id	= &nfnl_log_net_id,
 	.size	= sizeof(struct nfnl_log_net),
+	.async	= true,
 };
 
 static int __init nfnetlink_log_init(void)
-- 
cgit v1.2.3


From bd54dce0796522b669f5be45b96e02fa94738de9 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:40:09 +0300
Subject: net: Convert nfnl_queue_net_ops

These pernet_operations register and unregister net::nf::queue_handler
and /proc entry. The handler is accessed only under RCU, so this looks
safe to convert them.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nfnetlink_queue.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 8bba23160a68..59e2833c17f1 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1528,6 +1528,7 @@ static struct pernet_operations nfnl_queue_net_ops = {
 	.exit_batch	= nfnl_queue_net_exit_batch,
 	.id		= &nfnl_queue_net_id,
 	.size		= sizeof(struct nfnl_queue_net),
+	.async		= true,
 };
 
 static int __init nfnetlink_queue_init(void)
-- 
cgit v1.2.3


From 59d269731e2bcfb72d29f2e0281d6631aef1ff8e Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:40:19 +0300
Subject: net: Convert pg_net_ops

These pernet_operations create per-net pktgen threads
and /proc entries. These pernet subsys looks closed
in itself, and there are no pernet_operations outside
this file, which are interested in the threads.
Init and/or exit methods look safe to be executed
in parallel.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index b8ab5c829511..d81bddd1bb80 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3851,6 +3851,7 @@ static struct pernet_operations pg_net_ops = {
 	.exit = pg_net_exit,
 	.id   = &pg_net_id,
 	.size = sizeof(struct pktgen_net),
+	.async = true,
 };
 
 static int __init pg_init(void)
-- 
cgit v1.2.3


From 93623f2b00297f0eb4437fc2e47ad0d4fd58a3ad Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:40:28 +0300
Subject: net: Convert arptable_filter_net_ops

These pernet_operations unregister net::ipv4::arptable_filter.
Another net/pernet_operations do not send arp packets to foreign
net namespaces. So, we mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/arptable_filter.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 8f8713b4388f..49c2490193ae 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -65,6 +65,7 @@ static void __net_exit arptable_filter_net_exit(struct net *net)
 
 static struct pernet_operations arptable_filter_net_ops = {
 	.exit = arptable_filter_net_exit,
+	.async = true,
 };
 
 static int __init arptable_filter_init(void)
-- 
cgit v1.2.3


From 7ba81869d1f6f985e64031e161b4c009cc15be99 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:40:36 +0300
Subject: net: Convert iptable_mangle_net_ops

These pernet_operations unregister net::ipv4::iptable_mangle table.
Another net/pernet_operations do not send ipv4 packets to foreign
net namespaces. So, we mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/iptable_mangle.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index dea138ca8925..f6074059531a 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -113,6 +113,7 @@ static void __net_exit iptable_mangle_net_exit(struct net *net)
 
 static struct pernet_operations iptable_mangle_net_ops = {
 	.exit = iptable_mangle_net_exit,
+	.async = true,
 };
 
 static int __init iptable_mangle_init(void)
-- 
cgit v1.2.3


From 06a8a67b5dac661a0f0b865926ceab2e62d512cd Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:40:45 +0300
Subject: net: Convert iptable_nat_net_ops

These pernet_operations unregister net::ipv4::nat_table table.
Another net/pernet_operations do not send ipv4 packets to foreign
net namespaces. So, we mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/iptable_nat.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index 0f7255cc65ee..b771af74be79 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -129,6 +129,7 @@ static void __net_exit iptable_nat_net_exit(struct net *net)
 
 static struct pernet_operations iptable_nat_net_ops = {
 	.exit	= iptable_nat_net_exit,
+	.async	= true,
 };
 
 static int __init iptable_nat_init(void)
-- 
cgit v1.2.3


From 65f828c35261c46d848e9d789ccf29c2fe7127a8 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:40:58 +0300
Subject: net: Convert iptable_raw_net_ops

These pernet_operations unregister net::ipv4::iptable_raw table.
Another net/pernet_operations do not send ipv4 packets to foreign
net namespaces. So, we mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/iptable_raw.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 960625aabf04..963753e50842 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -76,6 +76,7 @@ static void __net_exit iptable_raw_net_exit(struct net *net)
 
 static struct pernet_operations iptable_raw_net_ops = {
 	.exit = iptable_raw_net_exit,
+	.async = true,
 };
 
 static int __init iptable_raw_init(void)
-- 
cgit v1.2.3


From 8dbc6e2eaeccccecd23888bd14c10a2c384c280f Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:41:07 +0300
Subject: net: Convert iptable_security_net_ops

These pernet_operations unregister net::ipv4::iptable_security table.
Another net/pernet_operations do not send ipv4 packets to foreign
net namespaces. So, we mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/iptable_security.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index e5379fe57b64..c40d6b3d8b6a 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -76,6 +76,7 @@ static void __net_exit iptable_security_net_exit(struct net *net)
 
 static struct pernet_operations iptable_security_net_ops = {
 	.exit = iptable_security_net_exit,
+	.async = true,
 };
 
 static int __init iptable_security_init(void)
-- 
cgit v1.2.3


From e8a95ad46378162f22c9293bde276d7c4030f31e Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:41:16 +0300
Subject: net: Convert ipv4_net_ops

These pernet_operations register and unregister bunch
of nf_conntrack_l4proto. Exit method unregisters related
sysctl, init method calls init_net and get_net_proto.
The whole builtin_l4proto4 array has pretty simple
init_net and get_net_proto methods. The first one register
sysctl table, the second one is just RO memory dereference.
So, these pernet_operations are safe to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index b50721d9d30e..6531f69db010 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -399,6 +399,7 @@ static struct pernet_operations ipv4_net_ops = {
 	.exit = ipv4_net_exit,
 	.id = &conntrack4_net_id,
 	.size = sizeof(struct conntrack4_net),
+	.async = true,
 };
 
 static int __init nf_conntrack_l3proto_ipv4_init(void)
-- 
cgit v1.2.3


From 1fd2c55705aebac31bf8f759c0750dc3c796cf47 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 7 Mar 2018 12:41:23 +0300
Subject: net: Convet ipv6_net_ops

These pernet_operations are similar to ipv4_net_ops.
They are safe to be async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 663827ee3cf8..ba54bb3bd1e4 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -401,6 +401,7 @@ static struct pernet_operations ipv6_net_ops = {
 	.exit = ipv6_net_exit,
 	.id = &conntrack6_net_id,
 	.size = sizeof(struct conntrack6_net),
+	.async = true,
 };
 
 static int __init nf_conntrack_l3proto_ipv6_init(void)
-- 
cgit v1.2.3


From 46e371f0e78a82186a83cbcb4f4b8850417c7dd5 Mon Sep 17 00:00:00 2001
From: William Tu <u9012063@gmail.com>
Date: Wed, 7 Mar 2018 15:38:48 -0800
Subject: openvswitch: fix vport packet length check.

When sending a packet to a tunnel device, the dev's hard_header_len
could be larger than the skb->len in function packet_length().
In the case of ip6gretap/erspan, hard_header_len = LL_MAX_HEADER + t_hlen,
which is around 180, and an ARP packet sent to this tunnel has
skb->len = 42.  This causes the 'unsign int length' to become super
large because it is negative value, causing the later ovs_vport_send
to drop it due to over-mtu size.  The patch fixes it by setting it to 0.

Signed-off-by: William Tu <u9012063@gmail.com>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/vport.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index b6c8524032a0..f81c1d0ddff4 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -464,10 +464,10 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
 	return 0;
 }
 
-static unsigned int packet_length(const struct sk_buff *skb,
-				  struct net_device *dev)
+static int packet_length(const struct sk_buff *skb,
+			 struct net_device *dev)
 {
-	unsigned int length = skb->len - dev->hard_header_len;
+	int length = skb->len - dev->hard_header_len;
 
 	if (!skb_vlan_tag_present(skb) &&
 	    eth_type_vlan(skb->protocol))
@@ -478,7 +478,7 @@ static unsigned int packet_length(const struct sk_buff *skb,
 	 * account for 802.1ad. e.g. is_skb_forwardable().
 	 */
 
-	return length;
+	return length > 0 ? length : 0;
 }
 
 void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto)
-- 
cgit v1.2.3


From 50db64b090d0f457a3266fe80e2c841eba621b9d Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 8 Mar 2018 12:36:04 +0300
Subject: net/ncsi: use kfree_skb() instead of kfree()

We're supposed to use kfree_skb() to free these sk_buffs.

Fixes: 955dc68cb9b2 ("net/ncsi: Add generic netlink family")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/ncsi-netlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index d4201665a580..b73239b76349 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -183,7 +183,7 @@ static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info)
 	hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
 			  &ncsi_genl_family, 0, NCSI_CMD_PKG_INFO);
 	if (!hdr) {
-		kfree(skb);
+		kfree_skb(skb);
 		return -EMSGSIZE;
 	}
 
@@ -204,7 +204,7 @@ static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info)
 
 err:
 	genlmsg_cancel(skb, hdr);
-	kfree(skb);
+	kfree_skb(skb);
 	return rc;
 }
 
-- 
cgit v1.2.3


From 054f34da60dfa46457ca39dd111bb2b393575dab Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 8 Mar 2018 12:36:28 +0300
Subject: net/ncsi: unlock on error in ncsi_set_interface_nl()

There are two error paths which are missing unlocks in this function.

Fixes: 955dc68cb9b2 ("net/ncsi: Add generic netlink family")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/ncsi-netlink.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index b73239b76349..05fcfb4fbe1d 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -299,6 +299,7 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
 			package = np;
 	if (!package) {
 		/* The user has set a package that does not exist */
+		spin_unlock_irqrestore(&ndp->lock, flags);
 		return -ERANGE;
 	}
 
@@ -317,6 +318,7 @@ static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info)
 		/* The user has set a channel that does not exist on this
 		 * package
 		 */
+		spin_unlock_irqrestore(&ndp->lock, flags);
 		netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n",
 			    channel_id);
 		return -ERANGE;
-- 
cgit v1.2.3


From 496c7f3caed5c56ba4ab6767a9cf7e7aa8bd8f41 Mon Sep 17 00:00:00 2001
From: kbuild test robot <fengguang.wu@intel.com>
Date: Thu, 8 Mar 2018 18:56:14 +0800
Subject: rds: rds_message_zcopy_from_user() can be static

Fixes: d40a126b16ea ("rds: refactor zcopy code into rds_message_zcopy_from_user")
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/message.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index 90dcdcfe9f62..3c610d5fe7ae 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -355,7 +355,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
 	return rm;
 }
 
-int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from)
+static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from)
 {
 	unsigned long sg_off;
 	struct scatterlist *sg;
-- 
cgit v1.2.3


From 571e6776add4f499661e761e03e46ec0f6d66243 Mon Sep 17 00:00:00 2001
From: kbuild test robot <fengguang.wu@intel.com>
Date: Thu, 8 Mar 2018 19:37:30 +0800
Subject: rds: rds_info_from_znotifier() can be static

Fixes: 9426bbc6de99 ("rds: use list structure to track information for zerocopy completion notification")
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/message.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index 3c610d5fe7ae..4462e4c9bd7d 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -67,7 +67,7 @@ static inline bool rds_zcookie_add(struct rds_msg_zcopy_info *info, u32 cookie)
 	return true;
 }
 
-struct rds_msg_zcopy_info *rds_info_from_znotifier(struct rds_znotifier *znotif)
+static struct rds_msg_zcopy_info *rds_info_from_znotifier(struct rds_znotifier *znotif)
 {
 	return container_of(znotif, struct rds_msg_zcopy_info, znotif);
 }
-- 
cgit v1.2.3


From 84a1d9c4820080bebcbd413a845076dcb62f45fa Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Thu, 8 Mar 2018 15:45:03 +0000
Subject: net: ethtool: extend RXNFC API to support RSS spreading of filter
 matches

We use a two-step process to configure a filter with RSS spreading.  First,
 the RSS context is allocated and configured using ETHTOOL_SRSSH; this
 returns an identifier (rss_context) which can then be passed to subsequent
 invocations of ETHTOOL_SRXCLSRLINS to specify that the offset from the RSS
 indirection table lookup should be added to the queue number (ring_cookie)
 when delivering the packet.  Drivers for devices which can only use the
 indirection table entry directly (not add it to a base queue number)
 should reject rule insertions combining RSS with a nonzero ring_cookie.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 64 +++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 49 insertions(+), 15 deletions(-)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 3f89c76d5c24..157cd9efa4be 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1022,6 +1022,15 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
 	if (copy_from_user(&info, useraddr, info_size))
 		return -EFAULT;
 
+	/* If FLOW_RSS was requested then user-space must be using the
+	 * new definition, as FLOW_RSS is newer.
+	 */
+	if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) {
+		info_size = sizeof(info);
+		if (copy_from_user(&info, useraddr, info_size))
+			return -EFAULT;
+	}
+
 	if (info.cmd == ETHTOOL_GRXCLSRLALL) {
 		if (info.rule_cnt > 0) {
 			if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
@@ -1251,9 +1260,11 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
 	user_key_size = rxfh.key_size;
 
 	/* Check that reserved fields are 0 for now */
-	if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
-	    rxfh.rsvd8[2] || rxfh.rsvd32)
+	if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
 		return -EINVAL;
+	/* Most drivers don't handle rss_context, check it's 0 as well */
+	if (rxfh.rss_context && !ops->get_rxfh_context)
+		return -EOPNOTSUPP;
 
 	rxfh.indir_size = dev_indir_size;
 	rxfh.key_size = dev_key_size;
@@ -1276,7 +1287,12 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
 	if (user_key_size)
 		hkey = rss_config + indir_bytes;
 
-	ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
+	if (rxfh.rss_context)
+		ret = dev->ethtool_ops->get_rxfh_context(dev, indir, hkey,
+							 &dev_hfunc,
+							 rxfh.rss_context);
+	else
+		ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
 	if (ret)
 		goto out;
 
@@ -1306,6 +1322,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	u8 *hkey = NULL;
 	u8 *rss_config;
 	u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
+	bool delete = false;
 
 	if (!ops->get_rxnfc || !ops->set_rxfh)
 		return -EOPNOTSUPP;
@@ -1319,9 +1336,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 		return -EFAULT;
 
 	/* Check that reserved fields are 0 for now */
-	if (rxfh.rss_context || rxfh.rsvd8[0] || rxfh.rsvd8[1] ||
-	    rxfh.rsvd8[2] || rxfh.rsvd32)
+	if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
 		return -EINVAL;
+	/* Most drivers don't handle rss_context, check it's 0 as well */
+	if (rxfh.rss_context && !ops->set_rxfh_context)
+		return -EOPNOTSUPP;
 
 	/* If either indir, hash key or function is valid, proceed further.
 	 * Must request at least one change: indir size, hash key or function.
@@ -1346,7 +1365,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 	if (ret)
 		goto out;
 
-	/* rxfh.indir_size == 0 means reset the indir table to default.
+	/* rxfh.indir_size == 0 means reset the indir table to default (master
+	 * context) or delete the context (other RSS contexts).
 	 * rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE means leave it unchanged.
 	 */
 	if (rxfh.indir_size &&
@@ -1359,9 +1379,13 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 		if (ret)
 			goto out;
 	} else if (rxfh.indir_size == 0) {
-		indir = (u32 *)rss_config;
-		for (i = 0; i < dev_indir_size; i++)
-			indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+		if (rxfh.rss_context == 0) {
+			indir = (u32 *)rss_config;
+			for (i = 0; i < dev_indir_size; i++)
+				indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+		} else {
+			delete = true;
+		}
 	}
 
 	if (rxfh.key_size) {
@@ -1374,15 +1398,25 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
 		}
 	}
 
-	ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
+	if (rxfh.rss_context)
+		ret = ops->set_rxfh_context(dev, indir, hkey, rxfh.hfunc,
+					    &rxfh.rss_context, delete);
+	else
+		ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
 	if (ret)
 		goto out;
 
-	/* indicate whether rxfh was set to default */
-	if (rxfh.indir_size == 0)
-		dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
-	else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
-		dev->priv_flags |= IFF_RXFH_CONFIGURED;
+	if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context),
+			 &rxfh.rss_context, sizeof(rxfh.rss_context)))
+		ret = -EFAULT;
+
+	if (!rxfh.rss_context) {
+		/* indicate whether rxfh was set to default */
+		if (rxfh.indir_size == 0)
+			dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+		else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+			dev->priv_flags |= IFF_RXFH_CONFIGURED;
+	}
 
 out:
 	kfree(rss_config);
-- 
cgit v1.2.3


From 79134e6ce2c9d1a00eab4d98cb48f975dd2474cb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 8 Mar 2018 12:51:41 -0800
Subject: net: do not create fallback tunnels for non-default namespaces

fallback tunnels (like tunl0, gre0, gretap0, erspan0, sit0,
ip6tnl0, ip6gre0) are automatically created when the corresponding
module is loaded.

These tunnels are also automatically created when a new network
namespace is created, at a great cost.

In many cases, netns are used for isolation purposes, and these
extra network devices are a waste of resources. We are using
thousands of netns per host, and hit the netns creation/delete
bottleneck a lot. (Many thanks to Kirill for recent work on this)

Add a new sysctl so that we can opt-out from this automatic creation.

Note that these tunnels are still created for the initial namespace,
to be the least intrusive for typical setups.

Tested:
lpk43:~# cat add_del_unshare.sh
for i in `seq 1 40`
do
 (for j in `seq 1 100` ; do  unshare -n /bin/true >/dev/null ; done) &
done
wait

lpk43:~# echo 0 >/proc/sys/net/core/fb_tunnels_only_for_init_net
lpk43:~# time ./add_del_unshare.sh

real	0m37.521s
user	0m0.886s
sys	7m7.084s
lpk43:~# echo 1 >/proc/sys/net/core/fb_tunnels_only_for_init_net
lpk43:~# time ./add_del_unshare.sh

real	0m4.761s
user	0m0.851s
sys	1m8.343s
lpk43:~#

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sysctl_net_core.c | 12 ++++++++++++
 net/ipv4/ip_tunnel.c       | 20 ++++++++++++--------
 net/ipv6/ip6_gre.c         |  4 +++-
 net/ipv6/ip6_tunnel.c      |  2 ++
 net/ipv6/sit.c             |  5 ++++-
 5 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index d714f65782b7..4f47f92459cc 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -32,6 +32,9 @@ static int max_skb_frags = MAX_SKB_FRAGS;
 
 static int net_msg_warn;	/* Unused, but still a sysctl */
 
+int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
+EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
+
 #ifdef CONFIG_RPS
 static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 				void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -513,6 +516,15 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
 	},
+	{
+		.procname	= "fb_tunnels_only_for_init_net",
+		.data		= &sysctl_fb_tunnels_only_for_init_net,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 	{ }
 };
 
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 602597dfc395..5fcb17cb426b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -347,8 +347,7 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
 	struct net_device *dev;
 	int t_hlen;
 
-	BUG_ON(!itn->fb_tunnel_dev);
-	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
+	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
 	if (IS_ERR(dev))
 		return ERR_CAST(dev);
 
@@ -822,7 +821,6 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 	struct net *net = t->net;
 	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
 
-	BUG_ON(!itn->fb_tunnel_dev);
 	switch (cmd) {
 	case SIOCGETTUNNEL:
 		if (dev == itn->fb_tunnel_dev) {
@@ -847,7 +845,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
 				p->o_key = 0;
 		}
 
-		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+		t = ip_tunnel_find(itn, p, itn->type);
 
 		if (cmd == SIOCADDTUNNEL) {
 			if (!t) {
@@ -991,10 +989,15 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
 	struct ip_tunnel_parm parms;
 	unsigned int i;
 
+	itn->rtnl_link_ops = ops;
 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
 		INIT_HLIST_HEAD(&itn->tunnels[i]);
 
-	if (!ops) {
+	if (!ops || !net_has_fallback_tunnels(net)) {
+		struct ip_tunnel_net *it_init_net;
+
+		it_init_net = net_generic(&init_net, ip_tnl_net_id);
+		itn->type = it_init_net->type;
 		itn->fb_tunnel_dev = NULL;
 		return 0;
 	}
@@ -1012,6 +1015,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
+		itn->type = itn->fb_tunnel_dev->type;
 	}
 	rtnl_unlock();
 
@@ -1019,10 +1023,10 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
 
-static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
+static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
+			      struct list_head *head,
 			      struct rtnl_link_ops *ops)
 {
-	struct net *net = dev_net(itn->fb_tunnel_dev);
 	struct net_device *dev, *aux;
 	int h;
 
@@ -1054,7 +1058,7 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
 	rtnl_lock();
 	list_for_each_entry(net, net_list, exit_list) {
 		itn = net_generic(net, id);
-		ip_tunnel_destroy(itn, &list, ops);
+		ip_tunnel_destroy(net, itn, &list, ops);
 	}
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 18a3dfbd0300..7d8775c9570d 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -236,7 +236,7 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
 		return t;
 
 	dev = ign->fb_tunnel_dev;
-	if (dev->flags & IFF_UP)
+	if (dev && dev->flags & IFF_UP)
 		return netdev_priv(dev);
 
 	return NULL;
@@ -1472,6 +1472,8 @@ static int __net_init ip6gre_init_net(struct net *net)
 	struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
 	int err;
 
+	if (!net_has_fallback_tunnels(net))
+		return 0;
 	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
 					  NET_NAME_UNKNOWN,
 					  ip6gre_tunnel_setup);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 56c4967f1868..5c045fa407da 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -2205,6 +2205,8 @@ static int __net_init ip6_tnl_init_net(struct net *net)
 	ip6n->tnls[0] = ip6n->tnls_wc;
 	ip6n->tnls[1] = ip6n->tnls_r_l;
 
+	if (!net_has_fallback_tunnels(net))
+		return 0;
 	err = -ENOMEM;
 	ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
 					NET_NAME_UNKNOWN, ip6_tnl_dev_setup);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index a9c4ac6efe22..8a4f8fddd812 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -182,7 +182,7 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
 #ifdef CONFIG_IPV6_SIT_6RD
 	struct ip_tunnel *t = netdev_priv(dev);
 
-	if (dev == sitn->fb_tunnel_dev) {
+	if (dev == sitn->fb_tunnel_dev || !sitn->fb_tunnel_dev) {
 		ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0);
 		t->ip6rd.relay_prefix = 0;
 		t->ip6rd.prefixlen = 16;
@@ -1835,6 +1835,9 @@ static int __net_init sit_init_net(struct net *net)
 	sitn->tunnels[2] = sitn->tunnels_r;
 	sitn->tunnels[3] = sitn->tunnels_r_l;
 
+	if (!net_has_fallback_tunnels(net))
+		return 0;
+
 	sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0",
 					   NET_NAME_UNKNOWN,
 					   ipip6_tunnel_setup);
-- 
cgit v1.2.3


From d04e6990c948a3315ea8eca5979ebea48cda56f4 Mon Sep 17 00:00:00 2001
From: Roman Mashak <mrv@mojatatu.com>
Date: Thu, 8 Mar 2018 16:59:17 -0500
Subject: net sched actions: update Add/Delete action API with new argument

Introduce a new function argument to carry total attributes size for
correct allocation of skb in event messages.

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 21 +++++++++++++--------
 net/sched/cls_api.c |  3 ++-
 2 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index a54fa7b8c217..3de0e0610200 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -741,7 +741,8 @@ static void cleanup_a(struct list_head *actions, int ovr)
 
 int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 		    struct nlattr *est, char *name, int ovr, int bind,
-		    struct list_head *actions, struct netlink_ext_ack *extack)
+		    struct list_head *actions, size_t *attr_size,
+		    struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
 	struct tc_action *act;
@@ -994,12 +995,13 @@ err_out:
 
 static int
 tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
-	       u32 portid, struct netlink_ext_ack *extack)
+	       u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
 {
 	int ret;
 	struct sk_buff *skb;
 
-	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size,
+			GFP_KERNEL);
 	if (!skb)
 		return -ENOBUFS;
 
@@ -1032,6 +1034,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 	int i, ret;
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
 	struct tc_action *act;
+	size_t attr_size = 0;
 	LIST_HEAD(actions);
 
 	ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, extack);
@@ -1059,7 +1062,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 	if (event == RTM_GETACTION)
 		ret = tcf_get_notify(net, portid, n, &actions, event, extack);
 	else { /* delete */
-		ret = tcf_del_notify(net, n, &actions, portid, extack);
+		ret = tcf_del_notify(net, n, &actions, portid, attr_size, extack);
 		if (ret)
 			goto err;
 		return ret;
@@ -1072,12 +1075,13 @@ err:
 
 static int
 tcf_add_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
-	       u32 portid, struct netlink_ext_ack *extack)
+	       u32 portid, size_t attr_size, struct netlink_ext_ack *extack)
 {
 	struct sk_buff *skb;
 	int err = 0;
 
-	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size,
+			GFP_KERNEL);
 	if (!skb)
 		return -ENOBUFS;
 
@@ -1099,15 +1103,16 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
 			  struct nlmsghdr *n, u32 portid, int ovr,
 			  struct netlink_ext_ack *extack)
 {
+	size_t attr_size = 0;
 	int ret = 0;
 	LIST_HEAD(actions);
 
 	ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, &actions,
-			      extack);
+			      &attr_size, extack);
 	if (ret)
 		return ret;
 
-	return tcf_add_notify(net, n, &actions, portid, extack);
+	return tcf_add_notify(net, n, &actions, portid, attr_size, extack);
 }
 
 static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 19f9f421d5b7..ec5fe8ec0c3e 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1433,6 +1433,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 #ifdef CONFIG_NET_CLS_ACT
 	{
 		struct tc_action *act;
+		size_t attr_size = 0;
 
 		if (exts->police && tb[exts->police]) {
 			act = tcf_action_init_1(net, tp, tb[exts->police],
@@ -1450,7 +1451,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb,
 
 			err = tcf_action_init(net, tp, tb[exts->action],
 					      rate_tlv, NULL, ovr, TCA_ACT_BIND,
-					      &actions, extack);
+					      &actions, &attr_size, extack);
 			if (err)
 				return err;
 			list_for_each_entry(act, &actions, list)
-- 
cgit v1.2.3


From 4e76e75d6aba83f35d55b50f66a6768af1657563 Mon Sep 17 00:00:00 2001
From: Roman Mashak <mrv@mojatatu.com>
Date: Thu, 8 Mar 2018 16:59:19 -0500
Subject: net sched actions: calculate add/delete event message size

Introduce routines to calculate size of the shared tc netlink attributes
and the full message size including netlink header and tc service header.

Update add/delete action logic to have the size for event messages,
the size is passed to tcf_add_notify() and tcf_del_notify() where the
notification message is being allocated and constructed.

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 3de0e0610200..57cf37145282 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -109,6 +109,42 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict)
 }
 EXPORT_SYMBOL(__tcf_idr_release);
 
+static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
+{
+	u32 cookie_len = 0;
+
+	if (act->act_cookie)
+		cookie_len = nla_total_size(act->act_cookie->len);
+
+	return  nla_total_size(0) /* action number nested */
+		+ nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
+		+ cookie_len /* TCA_ACT_COOKIE */
+		+ nla_total_size(0) /* TCA_ACT_STATS nested */
+		/* TCA_STATS_BASIC */
+		+ nla_total_size_64bit(sizeof(struct gnet_stats_basic))
+		/* TCA_STATS_QUEUE */
+		+ nla_total_size_64bit(sizeof(struct gnet_stats_queue))
+		+ nla_total_size(0) /* TCA_OPTIONS nested */
+		+ nla_total_size(sizeof(struct tcf_t)); /* TCA_GACT_TM */
+}
+
+static size_t tcf_action_full_attrs_size(size_t sz)
+{
+	return NLMSG_HDRLEN                     /* struct nlmsghdr */
+		+ sizeof(struct tcamsg)
+		+ nla_total_size(0)             /* TCA_ACT_TAB nested */
+		+ sz;
+}
+
+static size_t tcf_action_fill_size(const struct tc_action *act)
+{
+	size_t sz = tcf_action_shared_attrs_size(act);
+
+	if (act->ops->get_fill_size)
+		return act->ops->get_fill_size(act) + sz;
+	return sz;
+}
+
 static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
 			   struct netlink_callback *cb)
 {
@@ -746,6 +782,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 {
 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
 	struct tc_action *act;
+	size_t sz = 0;
 	int err;
 	int i;
 
@@ -761,11 +798,14 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla,
 			goto err;
 		}
 		act->order = i;
+		sz += tcf_action_fill_size(act);
 		if (ovr)
 			act->tcfa_refcnt++;
 		list_add_tail(&act->list, actions);
 	}
 
+	*attr_size = tcf_action_full_attrs_size(sz);
+
 	/* Remove the temp refcnt which was necessary to protect against
 	 * destroying an existing action which was being replaced
 	 */
@@ -1056,9 +1096,12 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
 			goto err;
 		}
 		act->order = i;
+		attr_size += tcf_action_fill_size(act);
 		list_add_tail(&act->list, &actions);
 	}
 
+	attr_size = tcf_action_full_attrs_size(attr_size);
+
 	if (event == RTM_GETACTION)
 		ret = tcf_get_notify(net, portid, n, &actions, event, extack);
 	else { /* delete */
-- 
cgit v1.2.3


From 9c5c9c573700a0a4a4afa431da3e7d21f2992627 Mon Sep 17 00:00:00 2001
From: Roman Mashak <mrv@mojatatu.com>
Date: Thu, 8 Mar 2018 16:59:20 -0500
Subject: net sched actions: implement get_fill_size routine in act_gact

Signed-off-by: Roman Mashak <mrv@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_gact.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'net')

diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 74563254e676..88fbb8403565 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -217,6 +217,19 @@ static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index,
 	return tcf_idr_search(tn, a, index);
 }
 
+static size_t tcf_gact_get_fill_size(const struct tc_action *act)
+{
+	size_t sz = nla_total_size(sizeof(struct tc_gact)); /* TCA_GACT_PARMS */
+
+#ifdef CONFIG_GACT_PROB
+	if (to_gact(act)->tcfg_ptype)
+		/* TCA_GACT_PROB */
+		sz += nla_total_size(sizeof(struct tc_gact_p));
+#endif
+
+	return sz;
+}
+
 static struct tc_action_ops act_gact_ops = {
 	.kind		=	"gact",
 	.type		=	TCA_ACT_GACT,
@@ -227,6 +240,7 @@ static struct tc_action_ops act_gact_ops = {
 	.init		=	tcf_gact_init,
 	.walk		=	tcf_gact_walker,
 	.lookup		=	tcf_gact_search,
+	.get_fill_size	=	tcf_gact_get_fill_size,
 	.size		=	sizeof(struct tcf_gact),
 };
 
-- 
cgit v1.2.3


From 35951393bbffa1ce351438bee264b48347c8cbb7 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Thu, 8 Mar 2018 23:43:40 -0600
Subject: pktgen: Remove VLA usage

In preparation to enabling -Wvla, remove VLA usage and replace it
with a fixed-length array instead.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index d81bddd1bb80..e2d6ae3b290b 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -907,7 +907,7 @@ static ssize_t pktgen_if_write(struct file *file,
 
 	if (debug) {
 		size_t copy = min_t(size_t, count, 1023);
-		char tb[copy + 1];
+		char tb[1024];
 		if (copy_from_user(tb, user_buffer, copy))
 			return -EFAULT;
 		tb[copy] = 0;
-- 
cgit v1.2.3


From f5426250a6ecfd1e9b2d5e0daf07565f664aa67d Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 9 Mar 2018 10:39:24 +0100
Subject: net: introduce IFF_NO_RX_HANDLER

Some network devices - notably ipvlan slave - are not compatible with
any kind of rx_handler. Currently the hook can be installed but any
configuration (bridge, bond, macsec, ...) is nonfunctional.

This change allocates a priv_flag bit to mark such devices and explicitly
forbid installing a rx_handler if such bit is set. The new bit is used
by ipvlan slave device.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index e5b8d42b6410..259abb1515d0 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4351,6 +4351,9 @@ int netdev_rx_handler_register(struct net_device *dev,
 	if (netdev_is_rx_handler_busy(dev))
 		return -EBUSY;
 
+	if (dev->priv_flags & IFF_NO_RX_HANDLER)
+		return -EINVAL;
+
 	/* Note: rx_handler_data must be set before rx_handler */
 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
 	rcu_assign_pointer(dev->rx_handler, rx_handler);
-- 
cgit v1.2.3


From bbfa047a25c379e467536c5ed2ba00c0cb602ca1 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 12 Mar 2018 11:09:33 -0400
Subject: ipv6: Use ip6_multipath_hash_policy() in rt6_multipath_hash().

Make use of the new helper.

Suggested-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f0ae58424c45..81711e3e2604 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1846,7 +1846,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
 	struct flow_keys hash_keys;
 	u32 mhash;
 
-	switch (net->ipv6.sysctl.multipath_hash_policy) {
+	switch (ip6_multipath_hash_policy(net)) {
 	case 0:
 		memset(&hash_keys, 0, sizeof(hash_keys));
 		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
-- 
cgit v1.2.3


From bdf08fc5412045f7648a49791d98cd04f72c1c1f Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Sun, 11 Mar 2018 17:27:56 +0100
Subject: rds: remove redundant variable 'sg_off'

Variable sg_off is assigned a value but it is never read, hence it is
redundant and can be removed.

Cleans up clang warning:
net/rds/message.c:373:2: warning: Value stored to 'sg_off' is never read

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/message.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/rds/message.c b/net/rds/message.c
index 4462e4c9bd7d..a35f76971984 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -357,7 +357,6 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
 
 static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *from)
 {
-	unsigned long sg_off;
 	struct scatterlist *sg;
 	int ret = 0;
 	int length = iov_iter_count(from);
@@ -370,7 +369,6 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter *
 	 * now allocate and copy in the data payload.
 	 */
 	sg = rm->data.op_sg;
-	sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
 
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
-- 
cgit v1.2.3


From 678f4bda35483473ae7ad674ece4c7c43a000888 Mon Sep 17 00:00:00 2001
From: Salvatore Mesoraca <s.mesoraca16@gmail.com>
Date: Sun, 11 Mar 2018 22:12:04 +0100
Subject: net: llc: drop VLA in llc_sap_mcast()

Avoid a VLA[1] by using a real constant expression instead of a variable.
The compiler should be able to optimize the original code and avoid using
an actual VLA. Anyway this change is useful because it will avoid a false
positive with -Wvla, it might also help the compiler generating better
code.

[1] https://lkml.org/lkml/2018/3/7/621

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/llc/llc_sap.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index d90928f50226..a7f7b8ff4729 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -394,8 +394,9 @@ static void llc_sap_mcast(struct llc_sap *sap,
 			  const struct llc_addr *laddr,
 			  struct sk_buff *skb)
 {
-	int i = 0, count = 256 / sizeof(struct sock *);
-	struct sock *sk, *stack[count];
+	int i = 0;
+	struct sock *sk;
+	struct sock *stack[256 / sizeof(struct sock *)];
 	struct llc_sock *llc;
 	struct hlist_head *dev_hb = llc_sk_dev_hash(sap, skb->dev->ifindex);
 
@@ -408,7 +409,7 @@ static void llc_sap_mcast(struct llc_sap *sap,
 			continue;
 
 		sock_hold(sk);
-		if (i < count)
+		if (i < ARRAY_SIZE(stack))
 			stack[i++] = sk;
 		else {
 			llc_do_mcast(sap, skb, stack, i);
-- 
cgit v1.2.3


From de8d5ab2ff6edc8e26822965a30b6aa4e9332025 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Mon, 12 Mar 2018 11:48:49 +0200
Subject: net: Make RX-FCS and HW GRO mutually exclusive

Same as LRO, hardware GRO cannot be enabled with RX-FCS.
When both are requested, hardware GRO will be dropped.

Suggested-by: David Miller <davem@davemloft.net>
Signed-off-by: Gal Pressman <galp@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 259abb1515d0..12a9aad0b057 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7549,10 +7549,17 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
 		}
 	}
 
-	/* LRO feature cannot be combined with RX-FCS */
-	if ((features & NETIF_F_LRO) && (features & NETIF_F_RXFCS)) {
-		netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
-		features &= ~NETIF_F_LRO;
+	/* LRO/HW-GRO features cannot be combined with RX-FCS */
+	if (features & NETIF_F_RXFCS) {
+		if (features & NETIF_F_LRO) {
+			netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
+			features &= ~NETIF_F_LRO;
+		}
+
+		if (features & NETIF_F_GRO_HW) {
+			netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
+			features &= ~NETIF_F_GRO_HW;
+		}
 	}
 
 	return features;
-- 
cgit v1.2.3


From f1cb9d68b4d8789d7f48bd293772095430860d86 Mon Sep 17 00:00:00 2001
From: Salvatore Mesoraca <s.mesoraca16@gmail.com>
Date: Sun, 11 Mar 2018 22:07:49 +0100
Subject: net: rds: drop VLA in rds_for_each_conn_info()

Avoid VLA[1] by using an already allocated buffer passed
by the caller.

[1] https://lkml.org/lkml/2018/3/7/621

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/connection.c | 2 +-
 net/rds/ib.c         | 3 +++
 net/rds/rds.h        | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 2da3176bf792..f80792c719eb 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -540,9 +540,9 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 			  struct rds_info_iterator *iter,
 			  struct rds_info_lengths *lens,
 			  int (*visitor)(struct rds_connection *, void *),
+			  u64 *buffer,
 			  size_t item_len)
 {
-	uint64_t buffer[(item_len + 7) / 8];
 	struct hlist_head *head;
 	struct rds_connection *conn;
 	size_t i;
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 50a88f3e7e39..02deee29e7f1 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -321,8 +321,11 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
 			   struct rds_info_iterator *iter,
 			   struct rds_info_lengths *lens)
 {
+	u64 buffer[(sizeof(struct rds_info_rdma_connection) + 7) / 8];
+
 	rds_for_each_conn_info(sock, len, iter, lens,
 				rds_ib_conn_info_visitor,
+				buffer,
 				sizeof(struct rds_info_rdma_connection));
 }
 
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 74cd27c661de..b04c333d9d1c 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -735,6 +735,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len,
 			  struct rds_info_iterator *iter,
 			  struct rds_info_lengths *lens,
 			  int (*visitor)(struct rds_connection *, void *),
+			  u64 *buffer,
 			  size_t item_len);
 
 __printf(2, 3)
-- 
cgit v1.2.3


From b2c9272ae7d366e85ab277f528c3dddb7f268b12 Mon Sep 17 00:00:00 2001
From: Salvatore Mesoraca <s.mesoraca16@gmail.com>
Date: Sun, 11 Mar 2018 22:07:50 +0100
Subject: net: rds: drop VLA in rds_walk_conn_path_info()

Avoid VLA[1] by using an already allocated buffer passed
by the caller.

[1] https://lkml.org/lkml/2018/3/7/621

Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/connection.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rds/connection.c b/net/rds/connection.c
index f80792c719eb..abef75da89a7 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -578,9 +578,9 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
 				    struct rds_info_iterator *iter,
 				    struct rds_info_lengths *lens,
 				    int (*visitor)(struct rds_conn_path *, void *),
+				    u64 *buffer,
 				    size_t item_len)
 {
-	u64  buffer[(item_len + 7) / 8];
 	struct hlist_head *head;
 	struct rds_connection *conn;
 	size_t i;
@@ -649,8 +649,11 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
 			  struct rds_info_iterator *iter,
 			  struct rds_info_lengths *lens)
 {
+	u64 buffer[(sizeof(struct rds_info_connection) + 7) / 8];
+
 	rds_walk_conn_path_info(sock, len, iter, lens,
 				rds_conn_info_visitor,
+				buffer,
 				sizeof(struct rds_info_connection));
 }
 
-- 
cgit v1.2.3


From d98985dd6c2dc69e2ad5f5482a5237fb9487ed99 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Tue, 13 Mar 2018 03:03:30 +0000
Subject: sctp: fix error return code in sctp_sendmsg_new_asoc()

Return error code -EINVAL in the address len check error handling
case since 'err' can be overwrite to 0 by 'err = sctp_verify_addr()'
in the for loop.

Fixes: 2c0dbaa0c43d ("sctp: add support for SCTP_DSTADDRV4/6 Information for sendmsg")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 7d3476a4860d..af5cf29b0c65 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1677,7 +1677,7 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
 	struct sctp_association *asoc;
 	enum sctp_scope scope;
 	struct cmsghdr *cmsg;
-	int err = -EINVAL;
+	int err;
 
 	*tp = NULL;
 
@@ -1761,16 +1761,20 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 sflags,
 		memset(daddr, 0, sizeof(*daddr));
 		dlen = cmsg->cmsg_len - sizeof(struct cmsghdr);
 		if (cmsg->cmsg_type == SCTP_DSTADDRV4) {
-			if (dlen < sizeof(struct in_addr))
+			if (dlen < sizeof(struct in_addr)) {
+				err = -EINVAL;
 				goto free;
+			}
 
 			dlen = sizeof(struct in_addr);
 			daddr->v4.sin_family = AF_INET;
 			daddr->v4.sin_port = htons(asoc->peer.port);
 			memcpy(&daddr->v4.sin_addr, CMSG_DATA(cmsg), dlen);
 		} else {
-			if (dlen < sizeof(struct in6_addr))
+			if (dlen < sizeof(struct in6_addr)) {
+				err = -EINVAL;
 				goto free;
+			}
 
 			dlen = sizeof(struct in6_addr);
 			daddr->v6.sin6_family = AF_INET6;
-- 
cgit v1.2.3


From 2e01ae0ef2db96c530249563383f479f8d9ca183 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Mar 2018 13:36:51 +0300
Subject: net: Convert sctp_defaults_ops

These pernet_operations have a deal with sysctl, /proc
entries and statistics. Also, there are freeing of
net::sctp::addr_waitq queue and net::sctp::local_addr_list
in exit method. All of them look pernet-divided, and it
seems these items are only interesting for sctp_defaults_ops,
which are safe to be executed in parallel.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/protocol.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 91813e686c67..32be52304f98 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1330,6 +1330,7 @@ static void __net_exit sctp_defaults_exit(struct net *net)
 static struct pernet_operations sctp_defaults_ops = {
 	.init = sctp_defaults_init,
 	.exit = sctp_defaults_exit,
+	.async = true,
 };
 
 static int __net_init sctp_ctrlsock_init(struct net *net)
-- 
cgit v1.2.3


From bfdfa38ff0e26463e418235c0be03c0f0eca4569 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Mar 2018 13:37:02 +0300
Subject: net: Convert sctp_ctrlsock_ops

These pernet_operations create and destroy net::sctp::ctl_sock.
Since pernet_operations do not send sctp packets each other,
they look safe to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/protocol.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 32be52304f98..606361ee9e4a 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1354,6 +1354,7 @@ static void __net_init sctp_ctrlsock_exit(struct net *net)
 static struct pernet_operations sctp_ctrlsock_ops = {
 	.init = sctp_ctrlsock_init,
 	.exit = sctp_ctrlsock_exit,
+	.async = true,
 };
 
 /* Initialize the universe into something sensible.  */
-- 
cgit v1.2.3


From afbbc374ab1281f2e5c18278a62a47fb906f0fa4 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Mar 2018 13:37:11 +0300
Subject: net: Convert tipc_net_ops

TIPC looks concentrated in itself, and other pernet_operations
seem not touching its entities.

tipc_net_ops look pernet-divided, and they should be safe to
be executed in parallel for several net the same time.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/tipc/core.c b/net/tipc/core.c
index 0b982d048fb9..04fd91bb11d7 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -105,6 +105,7 @@ static struct pernet_operations tipc_net_ops = {
 	.exit = tipc_exit_net,
 	.id   = &tipc_net_id,
 	.size = sizeof(struct tipc_net),
+	.async = true,
 };
 
 static int __init tipc_init(void)
-- 
cgit v1.2.3


From c939a5e4d597d635dee4eab2f27308d50fa606f7 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 13 Mar 2018 13:37:21 +0300
Subject: net: Convert rds_tcp_net_ops

These pernet_operations create and destroy sysctl table
and listen socket. Also, exit method flushes global
workqueue and work. Everything looks per-net safe,
so we can mark them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/tcp.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 08230a145042..eb04e7fa2467 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -515,6 +515,7 @@ static struct pernet_operations rds_tcp_net_ops = {
 	.exit = rds_tcp_exit_net,
 	.id = &rds_tcp_netid,
 	.size = sizeof(struct rds_tcp_net),
+	.async = true,
 };
 
 static void rds_tcp_kill_sock(struct net *net)
-- 
cgit v1.2.3


From 72597135cdd2fe524f9a185d7f954c2c3980f3ee Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 6 Mar 2018 08:26:00 +0100
Subject: netfilter: x_tables: fix build with CONFIG_COMPAT=n

I placed the helpers within CONFIG_COMPAT section, move them
outside.

Fixes: 472ebdcd15ebdb ("netfilter: x_tables: check error target size too")
Fixes: 07a9da51b4b6ae ("netfilter: x_tables: check standard verdicts in core")
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 62 ++++++++++++++++++++++++------------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 7521e8a72c06..bac932f1c582 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -577,6 +577,37 @@ error:
 }
 EXPORT_SYMBOL(xt_check_table_hooks);
 
+static bool verdict_ok(int verdict)
+{
+	if (verdict > 0)
+		return true;
+
+	if (verdict < 0) {
+		int v = -verdict - 1;
+
+		if (verdict == XT_RETURN)
+			return true;
+
+		switch (v) {
+		case NF_ACCEPT: return true;
+		case NF_DROP: return true;
+		case NF_QUEUE: return true;
+		default:
+			break;
+		}
+
+		return false;
+	}
+
+	return false;
+}
+
+static bool error_tg_ok(unsigned int usersize, unsigned int kernsize,
+			const char *msg, unsigned int msglen)
+{
+	return usersize == kernsize && strnlen(msg, msglen) < msglen;
+}
+
 #ifdef CONFIG_COMPAT
 int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
 {
@@ -736,37 +767,6 @@ struct compat_xt_error_target {
 	char errorname[XT_FUNCTION_MAXNAMELEN];
 };
 
-static bool verdict_ok(int verdict)
-{
-	if (verdict > 0)
-		return true;
-
-	if (verdict < 0) {
-		int v = -verdict - 1;
-
-		if (verdict == XT_RETURN)
-			return true;
-
-		switch (v) {
-		case NF_ACCEPT: return true;
-		case NF_DROP: return true;
-		case NF_QUEUE: return true;
-		default:
-			break;
-		}
-
-		return false;
-	}
-
-	return false;
-}
-
-static bool error_tg_ok(unsigned int usersize, unsigned int kernsize,
-			const char *msg, unsigned int msglen)
-{
-	return usersize == kernsize && strnlen(msg, msglen) < msglen;
-}
-
 int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				  unsigned int target_offset,
 				  unsigned int next_offset)
-- 
cgit v1.2.3


From a55efe1d416c345a73bef38848e1ac7109560e12 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Date: Mon, 5 Mar 2018 15:35:57 -0600
Subject: ipvs: use true and false for boolean values

Assign true or false to boolean variables instead of an integer value.

This issue was detected with the help of Coccinelle.

Signed-off-by: Gustavo A. R. Silva <garsilva@embeddedor.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/ip_vs_lblc.c  | 4 ++--
 net/netfilter/ipvs/ip_vs_lblcr.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 6a340c94c4b8..942e835caf7f 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -238,7 +238,7 @@ static void ip_vs_lblc_flush(struct ip_vs_service *svc)
 	int i;
 
 	spin_lock_bh(&svc->sched_lock);
-	tbl->dead = 1;
+	tbl->dead = true;
 	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
 		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
 			ip_vs_lblc_del(en);
@@ -369,7 +369,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
 	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
 	tbl->rover = 0;
 	tbl->counter = 1;
-	tbl->dead = 0;
+	tbl->dead = false;
 	tbl->svc = svc;
 
 	/*
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 0627881128da..a5acab25c36b 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -404,7 +404,7 @@ static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
 	struct hlist_node *next;
 
 	spin_lock_bh(&svc->sched_lock);
-	tbl->dead = 1;
+	tbl->dead = true;
 	for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
 		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
 			ip_vs_lblcr_free(en);
@@ -532,7 +532,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
 	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
 	tbl->rover = 0;
 	tbl->counter = 1;
-	tbl->dead = 0;
+	tbl->dead = false;
 	tbl->svc = svc;
 
 	/*
-- 
cgit v1.2.3


From a870a02cc963de35452bbed932560ed69725c4f2 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 13 Mar 2018 21:58:39 +0100
Subject: pktgen: use dynamic allocation for debug print buffer

After the removal of the VLA, we get a harmless warning about a large
stack frame:

net/core/pktgen.c: In function 'pktgen_if_write':
net/core/pktgen.c:1710:1: error: the frame size of 1076 bytes is larger than 1024 bytes [-Werror=frame-larger-than=]

The function was previously shown to be safe despite hitting
the 1024 bye warning level. To get rid of the annoyging warning,
while keeping it readable, this changes it to use strndup_user().

Obviously this is not a fast path, so the kmalloc() overhead
can be disregarded.

Fixes: 35951393bbff ("pktgen: Remove VLA usage")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index e2d6ae3b290b..fd65761e1fed 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -906,13 +906,14 @@ static ssize_t pktgen_if_write(struct file *file,
 	i += len;
 
 	if (debug) {
-		size_t copy = min_t(size_t, count, 1023);
-		char tb[1024];
-		if (copy_from_user(tb, user_buffer, copy))
-			return -EFAULT;
-		tb[copy] = 0;
-		pr_debug("%s,%lu  buffer -:%s:-\n",
-			 name, (unsigned long)count, tb);
+		size_t copy = min_t(size_t, count + 1, 1024);
+		char *tp = strndup_user(user_buffer, copy);
+
+		if (IS_ERR(tp))
+			return PTR_ERR(tp);
+
+		pr_debug("%s,%zu  buffer -:%s:-\n", name, count, tp);
+		kfree(buf);
 	}
 
 	if (!strcmp(name, "min_pkt_size")) {
-- 
cgit v1.2.3


From 41aeefcc38a2643a62db46818a70a781efb40d99 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@c0d3.blue>
Date: Tue, 13 Mar 2018 11:41:12 +0100
Subject: batman-adv: add DAT cache netlink support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dump the list of DAT cache entries via the netlink socket.

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/distributed-arp-table.c | 152 +++++++++++++++++++++++++++++++++
 net/batman-adv/distributed-arp-table.h |   8 ++
 net/batman-adv/netlink.c               |  76 ++++++++++-------
 3 files changed, 203 insertions(+), 33 deletions(-)

(limited to 'net')

diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 4469dcc1558f..75dda9454ccf 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -33,6 +33,7 @@
 #include <linux/kernel.h>
 #include <linux/kref.h>
 #include <linux/list.h>
+#include <linux/netlink.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/seq_file.h>
@@ -43,13 +44,19 @@
 #include <linux/string.h>
 #include <linux/workqueue.h>
 #include <net/arp.h>
+#include <net/genetlink.h>
+#include <net/netlink.h>
+#include <net/sock.h>
+#include <uapi/linux/batman_adv.h>
 
 #include "bridge_loop_avoidance.h"
 #include "hard-interface.h"
 #include "hash.h"
 #include "log.h"
+#include "netlink.h"
 #include "originator.h"
 #include "send.h"
+#include "soft-interface.h"
 #include "translation-table.h"
 #include "tvlv.h"
 
@@ -851,6 +858,151 @@ out:
 }
 #endif
 
+/**
+ * batadv_dat_cache_dump_entry() - dump one entry of the DAT cache table to a
+ *  netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @seq: Sequence number of netlink message
+ * @dat_entry: entry to dump
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+			    struct batadv_dat_entry *dat_entry)
+{
+	int msecs;
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+			  NLM_F_MULTI, BATADV_CMD_GET_DAT_CACHE);
+	if (!hdr)
+		return -ENOBUFS;
+
+	msecs = jiffies_to_msecs(jiffies - dat_entry->last_update);
+
+	if (nla_put_in_addr(msg, BATADV_ATTR_DAT_CACHE_IP4ADDRESS,
+			    dat_entry->ip) ||
+	    nla_put(msg, BATADV_ATTR_DAT_CACHE_HWADDRESS, ETH_ALEN,
+		    dat_entry->mac_addr) ||
+	    nla_put_u16(msg, BATADV_ATTR_DAT_CACHE_VID, dat_entry->vid) ||
+	    nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) {
+		genlmsg_cancel(msg, hdr);
+		return -EMSGSIZE;
+	}
+
+	genlmsg_end(msg, hdr);
+	return 0;
+}
+
+/**
+ * batadv_dat_cache_dump_bucket() - dump one bucket of the DAT cache table to
+ *  a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @seq: Sequence number of netlink message
+ * @head: bucket to dump
+ * @idx_skip: How many entries to skip
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+			     struct hlist_head *head, int *idx_skip)
+{
+	struct batadv_dat_entry *dat_entry;
+	int idx = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(dat_entry, head, hash_entry) {
+		if (idx < *idx_skip)
+			goto skip;
+
+		if (batadv_dat_cache_dump_entry(msg, portid, seq,
+						dat_entry)) {
+			rcu_read_unlock();
+			*idx_skip = idx;
+
+			return -EMSGSIZE;
+		}
+
+skip:
+		idx++;
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/**
+ * batadv_dat_cache_dump() - dump DAT cache table to a netlink socket
+ * @msg: buffer for the message
+ * @cb: callback structure containing arguments
+ *
+ * Return: message length.
+ */
+int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct batadv_hard_iface *primary_if = NULL;
+	int portid = NETLINK_CB(cb->skb).portid;
+	struct net *net = sock_net(cb->skb->sk);
+	struct net_device *soft_iface;
+	struct batadv_hashtable *hash;
+	struct batadv_priv *bat_priv;
+	int bucket = cb->args[0];
+	struct hlist_head *head;
+	int idx = cb->args[1];
+	int ifindex;
+	int ret = 0;
+
+	ifindex = batadv_netlink_get_ifindex(cb->nlh,
+					     BATADV_ATTR_MESH_IFINDEX);
+	if (!ifindex)
+		return -EINVAL;
+
+	soft_iface = dev_get_by_index(net, ifindex);
+	if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	bat_priv = netdev_priv(soft_iface);
+	hash = bat_priv->dat.hash;
+
+	primary_if = batadv_primary_if_get_selected(bat_priv);
+	if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	while (bucket < hash->size) {
+		head = &hash->table[bucket];
+
+		if (batadv_dat_cache_dump_bucket(msg, portid,
+						 cb->nlh->nlmsg_seq, head,
+						 &idx))
+			break;
+
+		bucket++;
+		idx = 0;
+	}
+
+	cb->args[0] = bucket;
+	cb->args[1] = idx;
+
+	ret = msg->len;
+
+out:
+	if (primary_if)
+		batadv_hardif_put(primary_if);
+
+	if (soft_iface)
+		dev_put(soft_iface);
+
+	return ret;
+}
+
 /**
  * batadv_arp_get_type() - parse an ARP packet and gets the type
  * @bat_priv: the bat priv with all the soft interface information
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index e24aa9601c52..a04596028337 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -28,6 +28,7 @@
 
 #include "originator.h"
 
+struct netlink_callback;
 struct seq_file;
 struct sk_buff;
 
@@ -81,6 +82,7 @@ batadv_dat_init_own_addr(struct batadv_priv *bat_priv,
 int batadv_dat_init(struct batadv_priv *bat_priv);
 void batadv_dat_free(struct batadv_priv *bat_priv);
 int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb);
 
 /**
  * batadv_dat_inc_counter() - increment the correct DAT packet counter
@@ -169,6 +171,12 @@ static inline void batadv_dat_free(struct batadv_priv *bat_priv)
 {
 }
 
+static inline int
+batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv,
 					  u8 subtype)
 {
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 129af56b944d..2b3e7c3f87fa 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -45,6 +45,7 @@
 
 #include "bat_algo.h"
 #include "bridge_loop_avoidance.h"
+#include "distributed-arp-table.h"
 #include "gateway_client.h"
 #include "hard-interface.h"
 #include "originator.h"
@@ -64,39 +65,42 @@ static const struct genl_multicast_group batadv_netlink_mcgrps[] = {
 };
 
 static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
-	[BATADV_ATTR_VERSION]		= { .type = NLA_STRING },
-	[BATADV_ATTR_ALGO_NAME]		= { .type = NLA_STRING },
-	[BATADV_ATTR_MESH_IFINDEX]	= { .type = NLA_U32 },
-	[BATADV_ATTR_MESH_IFNAME]	= { .type = NLA_STRING },
-	[BATADV_ATTR_MESH_ADDRESS]	= { .len = ETH_ALEN },
-	[BATADV_ATTR_HARD_IFINDEX]	= { .type = NLA_U32 },
-	[BATADV_ATTR_HARD_IFNAME]	= { .type = NLA_STRING },
-	[BATADV_ATTR_HARD_ADDRESS]	= { .len = ETH_ALEN },
-	[BATADV_ATTR_ORIG_ADDRESS]	= { .len = ETH_ALEN },
-	[BATADV_ATTR_TPMETER_RESULT]	= { .type = NLA_U8 },
-	[BATADV_ATTR_TPMETER_TEST_TIME]	= { .type = NLA_U32 },
-	[BATADV_ATTR_TPMETER_BYTES]	= { .type = NLA_U64 },
-	[BATADV_ATTR_TPMETER_COOKIE]	= { .type = NLA_U32 },
-	[BATADV_ATTR_ACTIVE]		= { .type = NLA_FLAG },
-	[BATADV_ATTR_TT_ADDRESS]	= { .len = ETH_ALEN },
-	[BATADV_ATTR_TT_TTVN]		= { .type = NLA_U8 },
-	[BATADV_ATTR_TT_LAST_TTVN]	= { .type = NLA_U8 },
-	[BATADV_ATTR_TT_CRC32]		= { .type = NLA_U32 },
-	[BATADV_ATTR_TT_VID]		= { .type = NLA_U16 },
-	[BATADV_ATTR_TT_FLAGS]		= { .type = NLA_U32 },
-	[BATADV_ATTR_FLAG_BEST]		= { .type = NLA_FLAG },
-	[BATADV_ATTR_LAST_SEEN_MSECS]	= { .type = NLA_U32 },
-	[BATADV_ATTR_NEIGH_ADDRESS]	= { .len = ETH_ALEN },
-	[BATADV_ATTR_TQ]		= { .type = NLA_U8 },
-	[BATADV_ATTR_THROUGHPUT]	= { .type = NLA_U32 },
-	[BATADV_ATTR_BANDWIDTH_UP]	= { .type = NLA_U32 },
-	[BATADV_ATTR_BANDWIDTH_DOWN]	= { .type = NLA_U32 },
-	[BATADV_ATTR_ROUTER]		= { .len = ETH_ALEN },
-	[BATADV_ATTR_BLA_OWN]		= { .type = NLA_FLAG },
-	[BATADV_ATTR_BLA_ADDRESS]	= { .len = ETH_ALEN },
-	[BATADV_ATTR_BLA_VID]		= { .type = NLA_U16 },
-	[BATADV_ATTR_BLA_BACKBONE]	= { .len = ETH_ALEN },
-	[BATADV_ATTR_BLA_CRC]		= { .type = NLA_U16 },
+	[BATADV_ATTR_VERSION]			= { .type = NLA_STRING },
+	[BATADV_ATTR_ALGO_NAME]			= { .type = NLA_STRING },
+	[BATADV_ATTR_MESH_IFINDEX]		= { .type = NLA_U32 },
+	[BATADV_ATTR_MESH_IFNAME]		= { .type = NLA_STRING },
+	[BATADV_ATTR_MESH_ADDRESS]		= { .len = ETH_ALEN },
+	[BATADV_ATTR_HARD_IFINDEX]		= { .type = NLA_U32 },
+	[BATADV_ATTR_HARD_IFNAME]		= { .type = NLA_STRING },
+	[BATADV_ATTR_HARD_ADDRESS]		= { .len = ETH_ALEN },
+	[BATADV_ATTR_ORIG_ADDRESS]		= { .len = ETH_ALEN },
+	[BATADV_ATTR_TPMETER_RESULT]		= { .type = NLA_U8 },
+	[BATADV_ATTR_TPMETER_TEST_TIME]		= { .type = NLA_U32 },
+	[BATADV_ATTR_TPMETER_BYTES]		= { .type = NLA_U64 },
+	[BATADV_ATTR_TPMETER_COOKIE]		= { .type = NLA_U32 },
+	[BATADV_ATTR_ACTIVE]			= { .type = NLA_FLAG },
+	[BATADV_ATTR_TT_ADDRESS]		= { .len = ETH_ALEN },
+	[BATADV_ATTR_TT_TTVN]			= { .type = NLA_U8 },
+	[BATADV_ATTR_TT_LAST_TTVN]		= { .type = NLA_U8 },
+	[BATADV_ATTR_TT_CRC32]			= { .type = NLA_U32 },
+	[BATADV_ATTR_TT_VID]			= { .type = NLA_U16 },
+	[BATADV_ATTR_TT_FLAGS]			= { .type = NLA_U32 },
+	[BATADV_ATTR_FLAG_BEST]			= { .type = NLA_FLAG },
+	[BATADV_ATTR_LAST_SEEN_MSECS]		= { .type = NLA_U32 },
+	[BATADV_ATTR_NEIGH_ADDRESS]		= { .len = ETH_ALEN },
+	[BATADV_ATTR_TQ]			= { .type = NLA_U8 },
+	[BATADV_ATTR_THROUGHPUT]		= { .type = NLA_U32 },
+	[BATADV_ATTR_BANDWIDTH_UP]		= { .type = NLA_U32 },
+	[BATADV_ATTR_BANDWIDTH_DOWN]		= { .type = NLA_U32 },
+	[BATADV_ATTR_ROUTER]			= { .len = ETH_ALEN },
+	[BATADV_ATTR_BLA_OWN]			= { .type = NLA_FLAG },
+	[BATADV_ATTR_BLA_ADDRESS]		= { .len = ETH_ALEN },
+	[BATADV_ATTR_BLA_VID]			= { .type = NLA_U16 },
+	[BATADV_ATTR_BLA_BACKBONE]		= { .len = ETH_ALEN },
+	[BATADV_ATTR_BLA_CRC]			= { .type = NLA_U16 },
+	[BATADV_ATTR_DAT_CACHE_IP4ADDRESS]	= { .type = NLA_U32 },
+	[BATADV_ATTR_DAT_CACHE_HWADDRESS]	= { .len = ETH_ALEN },
+	[BATADV_ATTR_DAT_CACHE_VID]		= { .type = NLA_U16 },
 };
 
 /**
@@ -604,6 +608,12 @@ static const struct genl_ops batadv_netlink_ops[] = {
 		.policy = batadv_netlink_policy,
 		.dumpit = batadv_bla_backbone_dump,
 	},
+	{
+		.cmd = BATADV_CMD_GET_DAT_CACHE,
+		.flags = GENL_ADMIN_PERM,
+		.policy = batadv_netlink_policy,
+		.dumpit = batadv_dat_cache_dump,
+	},
 
 };
 
-- 
cgit v1.2.3


From 53dd9a68ba683986ec90497586f94b941bb748a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Linus=20L=C3=BCssing?= <linus.luessing@c0d3.blue>
Date: Tue, 13 Mar 2018 11:41:13 +0100
Subject: batman-adv: add multicast flags netlink support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dump the list of multicast flags entries via the netlink socket.

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/multicast.c | 237 +++++++++++++++++++++++++++++++++++++++++++++
 net/batman-adv/multicast.h |  18 ++++
 net/batman-adv/netlink.c   |  12 +++
 3 files changed, 267 insertions(+)

(limited to 'net')

diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index c7a1305ca7e7..5615b6abea6f 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -40,6 +40,7 @@
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/netdevice.h>
+#include <linux/netlink.h>
 #include <linux/printk.h>
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
@@ -52,14 +53,20 @@
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include <net/addrconf.h>
+#include <net/genetlink.h>
 #include <net/if_inet6.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/netlink.h>
+#include <net/sock.h>
 #include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
 
 #include "hard-interface.h"
 #include "hash.h"
 #include "log.h"
+#include "netlink.h"
+#include "soft-interface.h"
 #include "translation-table.h"
 #include "tvlv.h"
 
@@ -1333,6 +1340,236 @@ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset)
 }
 #endif
 
+/**
+ * batadv_mcast_mesh_info_put() - put multicast info into a netlink message
+ * @msg: buffer for the message
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 or error code.
+ */
+int batadv_mcast_mesh_info_put(struct sk_buff *msg,
+			       struct batadv_priv *bat_priv)
+{
+	u32 flags = bat_priv->mcast.flags;
+	u32 flags_priv = BATADV_NO_FLAGS;
+
+	if (bat_priv->mcast.bridged) {
+		flags_priv |= BATADV_MCAST_FLAGS_BRIDGED;
+
+		if (bat_priv->mcast.querier_ipv4.exists)
+			flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS;
+		if (bat_priv->mcast.querier_ipv6.exists)
+			flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS;
+		if (bat_priv->mcast.querier_ipv4.shadowing)
+			flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING;
+		if (bat_priv->mcast.querier_ipv6.shadowing)
+			flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING;
+	}
+
+	if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, flags) ||
+	    nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS_PRIV, flags_priv))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/**
+ * batadv_mcast_flags_dump_entry() - dump one entry of the multicast flags table
+ *  to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @seq: Sequence number of netlink message
+ * @orig_node: originator to dump the multicast flags of
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+			      struct batadv_orig_node *orig_node)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
+			  NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS);
+	if (!hdr)
+		return -ENOBUFS;
+
+	if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
+		    orig_node->orig)) {
+		genlmsg_cancel(msg, hdr);
+		return -EMSGSIZE;
+	}
+
+	if (test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
+		     &orig_node->capabilities)) {
+		if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS,
+				orig_node->mcast_flags)) {
+			genlmsg_cancel(msg, hdr);
+			return -EMSGSIZE;
+		}
+	}
+
+	genlmsg_end(msg, hdr);
+	return 0;
+}
+
+/**
+ * batadv_mcast_flags_dump_bucket() - dump one bucket of the multicast flags
+ *  table to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @seq: Sequence number of netlink message
+ * @head: bucket to dump
+ * @idx_skip: How many entries to skip
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+			       struct hlist_head *head, long *idx_skip)
+{
+	struct batadv_orig_node *orig_node;
+	long idx = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+		if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
+			      &orig_node->capa_initialized))
+			continue;
+
+		if (idx < *idx_skip)
+			goto skip;
+
+		if (batadv_mcast_flags_dump_entry(msg, portid, seq,
+						  orig_node)) {
+			rcu_read_unlock();
+			*idx_skip = idx;
+
+			return -EMSGSIZE;
+		}
+
+skip:
+		idx++;
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/**
+ * __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket
+ * @msg: buffer for the message
+ * @portid: netlink port
+ * @seq: Sequence number of netlink message
+ * @bat_priv: the bat priv with all the soft interface information
+ * @bucket: current bucket to dump
+ * @idx: index in current bucket to the next entry to dump
+ *
+ * Return: 0 or error code.
+ */
+static int
+__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, u32 seq,
+			  struct batadv_priv *bat_priv, long *bucket, long *idx)
+{
+	struct batadv_hashtable *hash = bat_priv->orig_hash;
+	long bucket_tmp = *bucket;
+	struct hlist_head *head;
+	long idx_tmp = *idx;
+
+	while (bucket_tmp < hash->size) {
+		head = &hash->table[bucket_tmp];
+
+		if (batadv_mcast_flags_dump_bucket(msg, portid, seq, head,
+						   &idx_tmp))
+			break;
+
+		bucket_tmp++;
+		idx_tmp = 0;
+	}
+
+	*bucket = bucket_tmp;
+	*idx = idx_tmp;
+
+	return msg->len;
+}
+
+/**
+ * batadv_mcast_netlink_get_primary() - get primary interface from netlink
+ *  callback
+ * @cb: netlink callback structure
+ * @primary_if: the primary interface pointer to return the result in
+ *
+ * Return: 0 or error code.
+ */
+static int
+batadv_mcast_netlink_get_primary(struct netlink_callback *cb,
+				 struct batadv_hard_iface **primary_if)
+{
+	struct batadv_hard_iface *hard_iface = NULL;
+	struct net *net = sock_net(cb->skb->sk);
+	struct net_device *soft_iface;
+	struct batadv_priv *bat_priv;
+	int ifindex;
+	int ret = 0;
+
+	ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
+	if (!ifindex)
+		return -EINVAL;
+
+	soft_iface = dev_get_by_index(net, ifindex);
+	if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	bat_priv = netdev_priv(soft_iface);
+
+	hard_iface = batadv_primary_if_get_selected(bat_priv);
+	if (!hard_iface || hard_iface->if_status != BATADV_IF_ACTIVE) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+out:
+	if (soft_iface)
+		dev_put(soft_iface);
+
+	if (!ret && primary_if)
+		*primary_if = hard_iface;
+	else
+		batadv_hardif_put(hard_iface);
+
+	return ret;
+}
+
+/**
+ * batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket
+ * @msg: buffer for the message
+ * @cb: callback structure containing arguments
+ *
+ * Return: message length.
+ */
+int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+	struct batadv_hard_iface *primary_if = NULL;
+	int portid = NETLINK_CB(cb->skb).portid;
+	struct batadv_priv *bat_priv;
+	long *bucket = &cb->args[0];
+	long *idx = &cb->args[1];
+	int ret;
+
+	ret = batadv_mcast_netlink_get_primary(cb, &primary_if);
+	if (ret)
+		return ret;
+
+	bat_priv = netdev_priv(primary_if->soft_iface);
+	ret = __batadv_mcast_flags_dump(msg, portid, cb->nlh->nlmsg_seq,
+					bat_priv, bucket, idx);
+
+	batadv_hardif_put(primary_if);
+	return ret;
+}
+
 /**
  * batadv_mcast_free() - free the multicast optimizations structures
  * @bat_priv: the bat priv with all the soft interface information
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 6b8594e23da3..3b04ab13f0eb 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -21,6 +21,7 @@
 
 #include "main.h"
 
+struct netlink_callback;
 struct seq_file;
 struct sk_buff;
 
@@ -54,6 +55,11 @@ void batadv_mcast_init(struct batadv_priv *bat_priv);
 
 int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset);
 
+int batadv_mcast_mesh_info_put(struct sk_buff *msg,
+			       struct batadv_priv *bat_priv);
+
+int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb);
+
 void batadv_mcast_free(struct batadv_priv *bat_priv);
 
 void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node);
@@ -72,6 +78,18 @@ static inline int batadv_mcast_init(struct batadv_priv *bat_priv)
 	return 0;
 }
 
+static inline int
+batadv_mcast_mesh_info_put(struct sk_buff *msg, struct batadv_priv *bat_priv)
+{
+	return 0;
+}
+
+static inline int batadv_mcast_flags_dump(struct sk_buff *msg,
+					  struct netlink_callback *cb)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline void batadv_mcast_free(struct batadv_priv *bat_priv)
 {
 }
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 2b3e7c3f87fa..0d9459b69bdb 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -48,6 +48,7 @@
 #include "distributed-arp-table.h"
 #include "gateway_client.h"
 #include "hard-interface.h"
+#include "multicast.h"
 #include "originator.h"
 #include "soft-interface.h"
 #include "tp_meter.h"
@@ -101,6 +102,8 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
 	[BATADV_ATTR_DAT_CACHE_IP4ADDRESS]	= { .type = NLA_U32 },
 	[BATADV_ATTR_DAT_CACHE_HWADDRESS]	= { .len = ETH_ALEN },
 	[BATADV_ATTR_DAT_CACHE_VID]		= { .type = NLA_U16 },
+	[BATADV_ATTR_MCAST_FLAGS]		= { .type = NLA_U32 },
+	[BATADV_ATTR_MCAST_FLAGS_PRIV]		= { .type = NLA_U32 },
 };
 
 /**
@@ -151,6 +154,9 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface)
 		goto out;
 #endif
 
+	if (batadv_mcast_mesh_info_put(msg, bat_priv))
+		goto out;
+
 	primary_if = batadv_primary_if_get_selected(bat_priv);
 	if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) {
 		hard_iface = primary_if->net_dev;
@@ -614,6 +620,12 @@ static const struct genl_ops batadv_netlink_ops[] = {
 		.policy = batadv_netlink_policy,
 		.dumpit = batadv_dat_cache_dump,
 	},
+	{
+		.cmd = BATADV_CMD_GET_MCAST_FLAGS,
+		.flags = GENL_ADMIN_PERM,
+		.policy = batadv_netlink_policy,
+		.dumpit = batadv_mcast_flags_dump,
+	},
 
 };
 
-- 
cgit v1.2.3


From 29d1df72ce2ac187d457990fe445a16212dcfa19 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Wed, 14 Mar 2018 03:07:27 -0500
Subject: pktgen: Fix memory leak in pktgen_if_write

_buf_ is an array and the one that must be freed is _tp_ instead.

Fixes: a870a02cc963 ("pktgen: use dynamic allocation for debug print buffer")
Reported-by: Wang Jian <jianjian.wang1@gmail.com>
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/pktgen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index fd65761e1fed..545cf08cd558 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -913,7 +913,7 @@ static ssize_t pktgen_if_write(struct file *file,
 			return PTR_ERR(tp);
 
 		pr_debug("%s,%zu  buffer -:%s:-\n", name, count, tp);
-		kfree(buf);
+		kfree(tp);
 	}
 
 	if (!strcmp(name, "min_pkt_size")) {
-- 
cgit v1.2.3


From ced68234b6a244355aeab07c2681bc49f695eaed Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Wed, 14 Mar 2018 12:49:19 -0400
Subject: sock: remove zerocopy sockopt restriction on closed tcp state

Socket option SO_ZEROCOPY determines whether the kernel ignores or
processes flag MSG_ZEROCOPY on subsequent send calls. This to avoid
changing behavior for legacy processes.

Limiting the state change to closed sockets is annoying with passive
sockets and not necessary for correctness. Once created, zerocopy skbs
are processed based on their private state, not this socket flag.

Remove the constraint.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/sock.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 27f218bba43f..a8962d912895 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1052,8 +1052,6 @@ set_rcvbuf:
 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
 			if (sk->sk_protocol != IPPROTO_TCP)
 				ret = -ENOTSUPP;
-			else if (sk->sk_state != TCP_CLOSE)
-				ret = -EBUSY;
 		} else if (sk->sk_family != PF_RDS) {
 			ret = -ENOTSUPP;
 		}
-- 
cgit v1.2.3


From c9f4c6cf53bfafb639386a4c094929f13f573e04 Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Wed, 14 Mar 2018 11:01:00 +0100
Subject: net/smc: pay attention to MAX_ORDER for CQ entries

smc allocates a certain number of CQ entries for used RoCE devices. For
mlx5 devices the chosen constant number results in a large allocation
causing this warning:

[13355.124656] WARNING: CPU: 3 PID: 16535 at mm/page_alloc.c:3883 __alloc_pages_nodemask+0x2be/0x10c0
[13355.124657] Modules linked in: smc_diag(O) smc(O) xt_CHECKSUM iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 xt_tcpudp bridge stp llc ip6table_filter ip6_tables iptable_filter mlx5_ib ib_core sunrpc mlx5_core s390_trng rng_core ghash_s390 prng aes_s390 des_s390 des_generic sha512_s390 sha256_s390 sha1_s390 sha_common ptp pps_core eadm_sch dm_multipath dm_mod vhost_net tun vhost tap sch_fq_codel kvm ip_tables x_tables autofs4 [last unloaded: smc]
[13355.124672] CPU: 3 PID: 16535 Comm: kworker/3:0 Tainted: G           O    4.14.0uschi #1
[13355.124673] Hardware name: IBM 3906 M04 704 (LPAR)
[13355.124675] Workqueue: events smc_listen_work [smc]
[13355.124677] task: 00000000e2f22100 task.stack: 0000000084720000
[13355.124678] Krnl PSW : 0704c00180000000 000000000029da76 (__alloc_pages_nodemask+0x2be/0x10c0)
[13355.124681]            R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
[13355.124682] Krnl GPRS: 0000000000000000 00550e00014080c0 0000000000000000 0000000000000001
[13355.124684]            000000000029d8b6 00000000f3bfd710 0000000000000000 00000000014080c0
[13355.124685]            0000000000000009 00000000ec277a00 0000000000200000 0000000000000000
[13355.124686]            0000000000000000 00000000000001ff 000000000029d8b6 0000000084723720
[13355.124708] Krnl Code: 000000000029da6a: a7110200		tmll	%r1,512
                          000000000029da6e: a774ff29		brc	7,29d8c0
                         #000000000029da72: a7f40001		brc	15,29da74
                         >000000000029da76: a7f4ff25		brc	15,29d8c0
                          000000000029da7a: a7380000		lhi	%r3,0
                          000000000029da7e: a7f4fef1		brc	15,29d860
                          000000000029da82: 5820f0c4		l	%r2,196(%r15)
                          000000000029da86: a53e0048		llilh	%r3,72
[13355.124720] Call Trace:
[13355.124722] ([<000000000029d8b6>] __alloc_pages_nodemask+0xfe/0x10c0)
[13355.124724]  [<000000000013bd1e>] s390_dma_alloc+0x6e/0x148
[13355.124733]  [<000003ff802eeba6>] mlx5_dma_zalloc_coherent_node+0x8e/0xe0 [mlx5_core]
[13355.124740]  [<000003ff802eee18>] mlx5_buf_alloc_node+0x70/0x108 [mlx5_core]
[13355.124744]  [<000003ff804eb410>] mlx5_ib_create_cq+0x558/0x898 [mlx5_ib]
[13355.124749]  [<000003ff80407d40>] ib_create_cq+0x48/0x88 [ib_core]
[13355.124751]  [<000003ff80109fba>] smc_ib_setup_per_ibdev+0x52/0x118 [smc]
[13355.124753]  [<000003ff8010bcb6>] smc_conn_create+0x65e/0x728 [smc]
[13355.124755]  [<000003ff801081a2>] smc_listen_work+0x2d2/0x540 [smc]
[13355.124756]  [<0000000000162c66>] process_one_work+0x1be/0x440
[13355.124758]  [<0000000000162f40>] worker_thread+0x58/0x458
[13355.124759]  [<0000000000169e7e>] kthread+0x14e/0x168
[13355.124760]  [<00000000009ce8be>] kernel_thread_starter+0x6/0xc
[13355.124762]  [<00000000009ce8b8>] kernel_thread_starter+0x0/0xc
[13355.124762] Last Breaking-Event-Address:
[13355.124764]  [<000000000029da72>] __alloc_pages_nodemask+0x2ba/0x10c0
[13355.124764] ---[ end trace 34be38b581c0b585 ]---

This patch reduces the smc constant for the maximum number of allocated
completion queue entries SMC_MAX_CQE by 2 to avoid high round up values
in the mlx5 code, and reduces the number of allocated completion queue
entries even more, if the final allocation for an mlx5 device hits the
MAX_ORDER limit.

Reported-by: Ihnken Menssen <menssen@de.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_ib.c | 10 +++++++++-
 net/smc/smc_wr.h |  1 -
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 2a8957bd6d38..26df554f7588 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -23,6 +23,8 @@
 #include "smc_wr.h"
 #include "smc.h"
 
+#define SMC_MAX_CQE 32766	/* max. # of completion queue elements */
+
 #define SMC_QP_MIN_RNR_TIMER		5
 #define SMC_QP_TIMEOUT			15 /* 4096 * 2 ** timeout usec */
 #define SMC_QP_RETRY_CNT			7 /* 7: infinite */
@@ -438,9 +440,15 @@ out:
 long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
 {
 	struct ib_cq_init_attr cqattr =	{
-		.cqe = SMC_WR_MAX_CQE, .comp_vector = 0 };
+		.cqe = SMC_MAX_CQE, .comp_vector = 0 };
+	int cqe_size_order, smc_order;
 	long rc;
 
+	/* the calculated number of cq entries fits to mlx5 cq allocation */
+	cqe_size_order = cache_line_size() == 128 ? 7 : 6;
+	smc_order = MAX_ORDER - cqe_size_order - 1;
+	if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
+		cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
 	smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
 					      smc_wr_tx_cq_handler, NULL,
 					      smcibdev, &cqattr);
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index ef0c3494c9cb..210bec3c3ebe 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -19,7 +19,6 @@
 #include "smc.h"
 #include "smc_core.h"
 
-#define SMC_WR_MAX_CQE 32768	/* max. # of completion queue elements */
 #define SMC_WR_BUF_CNT 16	/* # of ctrl buffers per link */
 
 #define SMC_WR_TX_WAIT_FREE_SLOT_TIME	(10 * HZ)
-- 
cgit v1.2.3


From 268ffcc4ebfc8b10c1502357dfb82ce6af0770ac Mon Sep 17 00:00:00 2001
From: Ursula Braun <ubraun@linux.vnet.ibm.com>
Date: Wed, 14 Mar 2018 11:01:01 +0100
Subject: net/smc: free link group without pending free_work only

Make sure there is no pending or running free_work worker for the link
group when freeing the link group.

Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c   | 1 +
 net/smc/smc_core.c | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 2c6f4e0a9f3d..649489f825a5 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1477,6 +1477,7 @@ static void __exit smc_exit(void)
 	spin_unlock_bh(&smc_lgr_list.lock);
 	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
 		list_del_init(&lgr->list);
+		cancel_delayed_work_sync(&lgr->free_work);
 		smc_lgr_free(lgr); /* free link group */
 	}
 	static_branch_disable(&tcp_have_smc);
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index f76f60e463cb..ec6189fe2a48 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -140,7 +140,8 @@ static void smc_lgr_free_work(struct work_struct *work)
 	list_del_init(&lgr->list); /* remove from smc_lgr_list */
 free:
 	spin_unlock_bh(&smc_lgr_list.lock);
-	smc_lgr_free(lgr);
+	if (!delayed_work_pending(&lgr->free_work))
+		smc_lgr_free(lgr);
 }
 
 /* create a new SMC link group */
-- 
cgit v1.2.3


From 97cdbc4213df101228564b8d27090cc1f9a26332 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.vnet.ibm.com>
Date: Wed, 14 Mar 2018 11:01:02 +0100
Subject: net/smc: schedule free_work when link group is terminated

The free_work worker must be scheduled when the link group is
abnormally terminated.

Signed-off-by: Karsten Graul <kgraul@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_core.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index ec6189fe2a48..f44f6803f7ff 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -32,6 +32,17 @@
 
 static u32 smc_lgr_num;			/* unique link group number */
 
+static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
+{
+	/* client link group creation always follows the server link group
+	 * creation. For client use a somewhat higher removal delay time,
+	 * otherwise there is a risk of out-of-sync link groups.
+	 */
+	mod_delayed_work(system_wq, &lgr->free_work,
+			 lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT :
+						 SMC_LGR_FREE_DELAY_SERV);
+}
+
 /* Register connection's alert token in our lookup structure.
  * To use rbtrees we have to implement our own insert core.
  * Requires @conns_lock
@@ -111,13 +122,7 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
 	write_unlock_bh(&lgr->conns_lock);
 	if (!reduced || lgr->conns_num)
 		return;
-	/* client link group creation always follows the server link group
-	 * creation. For client use a somewhat higher removal delay time,
-	 * otherwise there is a risk of out-of-sync link groups.
-	 */
-	mod_delayed_work(system_wq, &lgr->free_work,
-			 lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT :
-						 SMC_LGR_FREE_DELAY_SERV);
+	smc_lgr_schedule_free_work(lgr);
 }
 
 static void smc_lgr_free_work(struct work_struct *work)
@@ -344,6 +349,7 @@ void smc_lgr_terminate(struct smc_link_group *lgr)
 	}
 	write_unlock_bh(&lgr->conns_lock);
 	wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait);
+	smc_lgr_schedule_free_work(lgr);
 }
 
 /* Determine vlan of internal TCP socket.
-- 
cgit v1.2.3


From 1b1e0bc9947427ae58bbe7de0ce9cfd591b589b9 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 14 Mar 2018 19:05:30 +0800
Subject: sctp: add refcnt support for sh_key

With refcnt support for sh_key, chunks auth sh_keys can be decided
before enqueuing it. Changing the active key later will not affect
the chunks already enqueued.

Furthermore, this is necessary when adding the support for authinfo
for sendmsg in next patch.

Note that struct sctp_chunk can't be grown due to that performance
drop issue on slow cpu, so it just reuses head_skb memory for shkey
in sctp_chunk.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/auth.c          | 86 ++++++++++++++++++++++++------------------------
 net/sctp/chunk.c         |  5 +++
 net/sctp/output.c        | 18 ++++++++--
 net/sctp/sm_make_chunk.c | 15 +++++++--
 net/sctp/sm_statefuns.c  | 11 ++++---
 net/sctp/socket.c        |  6 ++++
 6 files changed, 89 insertions(+), 52 deletions(-)

(limited to 'net')

diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 00667c50efa7..e5214fd7f650 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -101,13 +101,14 @@ struct sctp_shared_key *sctp_auth_shkey_create(__u16 key_id, gfp_t gfp)
 		return NULL;
 
 	INIT_LIST_HEAD(&new->key_list);
+	refcount_set(&new->refcnt, 1);
 	new->key_id = key_id;
 
 	return new;
 }
 
 /* Free the shared key structure */
-static void sctp_auth_shkey_free(struct sctp_shared_key *sh_key)
+static void sctp_auth_shkey_destroy(struct sctp_shared_key *sh_key)
 {
 	BUG_ON(!list_empty(&sh_key->key_list));
 	sctp_auth_key_put(sh_key->key);
@@ -115,6 +116,17 @@ static void sctp_auth_shkey_free(struct sctp_shared_key *sh_key)
 	kfree(sh_key);
 }
 
+void sctp_auth_shkey_release(struct sctp_shared_key *sh_key)
+{
+	if (refcount_dec_and_test(&sh_key->refcnt))
+		sctp_auth_shkey_destroy(sh_key);
+}
+
+void sctp_auth_shkey_hold(struct sctp_shared_key *sh_key)
+{
+	refcount_inc(&sh_key->refcnt);
+}
+
 /* Destroy the entire key list.  This is done during the
  * associon and endpoint free process.
  */
@@ -128,7 +140,7 @@ void sctp_auth_destroy_keys(struct list_head *keys)
 
 	key_for_each_safe(ep_key, tmp, keys) {
 		list_del_init(&ep_key->key_list);
-		sctp_auth_shkey_free(ep_key);
+		sctp_auth_shkey_release(ep_key);
 	}
 }
 
@@ -409,13 +421,19 @@ int sctp_auth_asoc_init_active_key(struct sctp_association *asoc, gfp_t gfp)
 
 	sctp_auth_key_put(asoc->asoc_shared_key);
 	asoc->asoc_shared_key = secret;
+	asoc->shkey = ep_key;
 
 	/* Update send queue in case any chunk already in there now
 	 * needs authenticating
 	 */
 	list_for_each_entry(chunk, &asoc->outqueue.out_chunk_list, list) {
-		if (sctp_auth_send_cid(chunk->chunk_hdr->type, asoc))
+		if (sctp_auth_send_cid(chunk->chunk_hdr->type, asoc)) {
 			chunk->auth = 1;
+			if (!chunk->shkey) {
+				chunk->shkey = asoc->shkey;
+				sctp_auth_shkey_hold(chunk->shkey);
+			}
+		}
 	}
 
 	return 0;
@@ -703,16 +721,15 @@ int sctp_auth_recv_cid(enum sctp_cid chunk, const struct sctp_association *asoc)
  *    after the AUTH chunk in the SCTP packet.
  */
 void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
-			      struct sk_buff *skb,
-			      struct sctp_auth_chunk *auth,
-			      gfp_t gfp)
+			      struct sk_buff *skb, struct sctp_auth_chunk *auth,
+			      struct sctp_shared_key *ep_key, gfp_t gfp)
 {
-	struct crypto_shash *tfm;
 	struct sctp_auth_bytes *asoc_key;
+	struct crypto_shash *tfm;
 	__u16 key_id, hmac_id;
-	__u8 *digest;
 	unsigned char *end;
 	int free_key = 0;
+	__u8 *digest;
 
 	/* Extract the info we need:
 	 * - hmac id
@@ -724,12 +741,7 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
 	if (key_id == asoc->active_key_id)
 		asoc_key = asoc->asoc_shared_key;
 	else {
-		struct sctp_shared_key *ep_key;
-
-		ep_key = sctp_auth_get_shkey(asoc, key_id);
-		if (!ep_key)
-			return;
-
+		/* ep_key can't be NULL here */
 		asoc_key = sctp_auth_asoc_create_secret(asoc, ep_key, gfp);
 		if (!asoc_key)
 			return;
@@ -829,7 +841,7 @@ int sctp_auth_set_key(struct sctp_endpoint *ep,
 		      struct sctp_association *asoc,
 		      struct sctp_authkey *auth_key)
 {
-	struct sctp_shared_key *cur_key = NULL;
+	struct sctp_shared_key *cur_key, *shkey;
 	struct sctp_auth_bytes *key;
 	struct list_head *sh_keys;
 	int replace = 0;
@@ -842,46 +854,34 @@ int sctp_auth_set_key(struct sctp_endpoint *ep,
 	else
 		sh_keys = &ep->endpoint_shared_keys;
 
-	key_for_each(cur_key, sh_keys) {
-		if (cur_key->key_id == auth_key->sca_keynumber) {
+	key_for_each(shkey, sh_keys) {
+		if (shkey->key_id == auth_key->sca_keynumber) {
 			replace = 1;
 			break;
 		}
 	}
 
-	/* If we are not replacing a key id, we need to allocate
-	 * a shared key.
-	 */
-	if (!replace) {
-		cur_key = sctp_auth_shkey_create(auth_key->sca_keynumber,
-						 GFP_KERNEL);
-		if (!cur_key)
-			return -ENOMEM;
-	}
+	cur_key = sctp_auth_shkey_create(auth_key->sca_keynumber, GFP_KERNEL);
+	if (!cur_key)
+		return -ENOMEM;
 
 	/* Create a new key data based on the info passed in */
 	key = sctp_auth_create_key(auth_key->sca_keylength, GFP_KERNEL);
-	if (!key)
-		goto nomem;
+	if (!key) {
+		kfree(cur_key);
+		return -ENOMEM;
+	}
 
 	memcpy(key->data, &auth_key->sca_key[0], auth_key->sca_keylength);
+	cur_key->key = key;
 
-	/* If we are replacing, remove the old keys data from the
-	 * key id.  If we are adding new key id, add it to the
-	 * list.
-	 */
-	if (replace)
-		sctp_auth_key_put(cur_key->key);
-	else
-		list_add(&cur_key->key_list, sh_keys);
+	if (replace) {
+		list_del_init(&shkey->key_list);
+		sctp_auth_shkey_release(shkey);
+	}
+	list_add(&cur_key->key_list, sh_keys);
 
-	cur_key->key = key;
 	return 0;
-nomem:
-	if (!replace)
-		sctp_auth_shkey_free(cur_key);
-
-	return -ENOMEM;
 }
 
 int sctp_auth_set_active_key(struct sctp_endpoint *ep,
@@ -952,7 +952,7 @@ int sctp_auth_del_key_id(struct sctp_endpoint *ep,
 
 	/* Delete the shared key */
 	list_del_init(&key->key_list);
-	sctp_auth_shkey_free(key);
+	sctp_auth_shkey_release(key);
 
 	return 0;
 }
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 991a530c6b31..9f28a9aae976 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -168,6 +168,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 {
 	size_t len, first_len, max_data, remaining;
 	size_t msg_len = iov_iter_count(from);
+	struct sctp_shared_key *shkey = NULL;
 	struct list_head *pos, *temp;
 	struct sctp_chunk *chunk;
 	struct sctp_datamsg *msg;
@@ -204,6 +205,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 		if (hmac_desc)
 			max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) +
 					      hmac_desc->hmac_len);
+
+		shkey = asoc->shkey;
 	}
 
 	/* Check what's our max considering the above */
@@ -275,6 +278,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 		if (err < 0)
 			goto errout_chunk_free;
 
+		chunk->shkey = shkey;
+
 		/* Put the chunk->skb back into the form expected by send.  */
 		__skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr -
 				       chunk->skb->data);
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 01a26ee051e3..d6e1c90cc09a 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -241,10 +241,13 @@ static enum sctp_xmit sctp_packet_bundle_auth(struct sctp_packet *pkt,
 	if (!chunk->auth)
 		return retval;
 
-	auth = sctp_make_auth(asoc);
+	auth = sctp_make_auth(asoc, chunk->shkey->key_id);
 	if (!auth)
 		return retval;
 
+	auth->shkey = chunk->shkey;
+	sctp_auth_shkey_hold(auth->shkey);
+
 	retval = __sctp_packet_append_chunk(pkt, auth);
 
 	if (retval != SCTP_XMIT_OK)
@@ -490,7 +493,8 @@ merge:
 		}
 
 		if (auth) {
-			sctp_auth_calculate_hmac(tp->asoc, nskb, auth, gfp);
+			sctp_auth_calculate_hmac(tp->asoc, nskb, auth,
+						 packet->auth->shkey, gfp);
 			/* free auth if no more chunks, or add it back */
 			if (list_empty(&packet->chunk_list))
 				sctp_chunk_free(packet->auth);
@@ -770,6 +774,16 @@ static enum sctp_xmit sctp_packet_will_fit(struct sctp_packet *packet,
 	enum sctp_xmit retval = SCTP_XMIT_OK;
 	size_t psize, pmtu, maxsize;
 
+	/* Don't bundle in this packet if this chunk's auth key doesn't
+	 * match other chunks already enqueued on this packet. Also,
+	 * don't bundle the chunk with auth key if other chunks in this
+	 * packet don't have auth key.
+	 */
+	if ((packet->auth && chunk->shkey != packet->auth->shkey) ||
+	    (!packet->auth && chunk->shkey &&
+	     chunk->chunk_hdr->type != SCTP_CID_AUTH))
+		return SCTP_XMIT_PMTU_FULL;
+
 	psize = packet->size;
 	if (packet->transport->asoc)
 		pmtu = packet->transport->asoc->pathmtu;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index d01475f5f710..10f071cdf188 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -87,7 +87,10 @@ static void  *sctp_addto_chunk_fixed(struct sctp_chunk *, int len,
 /* Control chunk destructor */
 static void sctp_control_release_owner(struct sk_buff *skb)
 {
-	/*TODO: do memory release */
+	struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg;
+
+	if (chunk->shkey)
+		sctp_auth_shkey_release(chunk->shkey);
 }
 
 static void sctp_control_set_owner_w(struct sctp_chunk *chunk)
@@ -102,7 +105,12 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk)
 	 *
 	 *  For now don't do anything for now.
 	 */
+	if (chunk->auth) {
+		chunk->shkey = asoc->shkey;
+		sctp_auth_shkey_hold(chunk->shkey);
+	}
 	skb->sk = asoc ? asoc->base.sk : NULL;
+	skb_shinfo(skb)->destructor_arg = chunk;
 	skb->destructor = sctp_control_release_owner;
 }
 
@@ -1271,7 +1279,8 @@ nodata:
 	return retval;
 }
 
-struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc)
+struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc,
+				  __u16 key_id)
 {
 	struct sctp_authhdr auth_hdr;
 	struct sctp_hmac *hmac_desc;
@@ -1289,7 +1298,7 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc)
 		return NULL;
 
 	auth_hdr.hmac_id = htons(hmac_desc->hmac_id);
-	auth_hdr.shkey_id = htons(asoc->active_key_id);
+	auth_hdr.shkey_id = htons(key_id);
 
 	retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(auth_hdr),
 						 &auth_hdr);
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index eb7905ffe5f2..792e0e2be320 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -4114,6 +4114,7 @@ static enum sctp_ierror sctp_sf_authenticate(
 					const union sctp_subtype type,
 					struct sctp_chunk *chunk)
 {
+	struct sctp_shared_key *sh_key = NULL;
 	struct sctp_authhdr *auth_hdr;
 	__u8 *save_digest, *digest;
 	struct sctp_hmac *hmac;
@@ -4135,9 +4136,11 @@ static enum sctp_ierror sctp_sf_authenticate(
 	 * configured
 	 */
 	key_id = ntohs(auth_hdr->shkey_id);
-	if (key_id != asoc->active_key_id && !sctp_auth_get_shkey(asoc, key_id))
-		return SCTP_IERROR_AUTH_BAD_KEYID;
-
+	if (key_id != asoc->active_key_id) {
+		sh_key = sctp_auth_get_shkey(asoc, key_id);
+		if (!sh_key)
+			return SCTP_IERROR_AUTH_BAD_KEYID;
+	}
 
 	/* Make sure that the length of the signature matches what
 	 * we expect.
@@ -4166,7 +4169,7 @@ static enum sctp_ierror sctp_sf_authenticate(
 
 	sctp_auth_calculate_hmac(asoc, chunk->skb,
 				 (struct sctp_auth_chunk *)chunk->chunk_hdr,
-				 GFP_ATOMIC);
+				 sh_key, GFP_ATOMIC);
 
 	/* Discard the packet if the digests do not match */
 	if (memcmp(save_digest, digest, sig_len)) {
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index af5cf29b0c65..003a4ad89c01 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -156,6 +156,9 @@ static inline void sctp_set_owner_w(struct sctp_chunk *chunk)
 	/* The sndbuf space is tracked per association.  */
 	sctp_association_hold(asoc);
 
+	if (chunk->shkey)
+		sctp_auth_shkey_hold(chunk->shkey);
+
 	skb_set_owner_w(chunk->skb, sk);
 
 	chunk->skb->destructor = sctp_wfree;
@@ -8109,6 +8112,9 @@ static void sctp_wfree(struct sk_buff *skb)
 	sk->sk_wmem_queued   -= skb->truesize;
 	sk_mem_uncharge(sk, skb->truesize);
 
+	if (chunk->shkey)
+		sctp_auth_shkey_release(chunk->shkey);
+
 	sock_wfree(skb);
 	sctp_wake_up_waiters(sk, asoc);
 
-- 
cgit v1.2.3


From 3ff547c06a7d75d72d37dae2c064fcf0672e56c0 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 14 Mar 2018 19:05:31 +0800
Subject: sctp: add support for SCTP AUTH Information for sendmsg

This patch is to add support for SCTP AUTH Information for sendmsg,
as described in section 5.3.8 of RFC6458.

With this option, you can provide shared key identifier used for
sending the user message.

It's also a necessary send info for sctp_sendv.

Note that it reuses sinfo->sinfo_tsn to indicate if this option is
set and sinfo->sinfo_ssn to save the shkey ID which can be 0.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/chunk.c  | 11 ++++++++++-
 net/sctp/socket.c | 23 +++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 9f28a9aae976..f889a84f264d 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -206,7 +206,16 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
 			max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) +
 					      hmac_desc->hmac_len);
 
-		shkey = asoc->shkey;
+		if (sinfo->sinfo_tsn &&
+		    sinfo->sinfo_ssn != asoc->active_key_id) {
+			shkey = sctp_auth_get_shkey(asoc, sinfo->sinfo_ssn);
+			if (!shkey) {
+				err = -EINVAL;
+				goto errout;
+			}
+		} else {
+			shkey = asoc->shkey;
+		}
 	}
 
 	/* Check what's our max considering the above */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 003a4ad89c01..9ffdecbc3531 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1987,6 +1987,14 @@ static void sctp_sendmsg_update_sinfo(struct sctp_association *asoc,
 
 	if (!cmsgs->srinfo && !cmsgs->prinfo)
 		sinfo->sinfo_timetolive = asoc->default_timetolive;
+
+	if (cmsgs->authinfo) {
+		/* Reuse sinfo_tsn to indicate that authinfo was set and
+		 * sinfo_ssn to save the keyid on tx path.
+		 */
+		sinfo->sinfo_tsn = 1;
+		sinfo->sinfo_ssn = cmsgs->authinfo->auth_keynumber;
+	}
 }
 
 static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
@@ -7874,6 +7882,21 @@ static int sctp_msghdr_parse(const struct msghdr *msg, struct sctp_cmsgs *cmsgs)
 			if (cmsgs->prinfo->pr_policy == SCTP_PR_SCTP_NONE)
 				cmsgs->prinfo->pr_value = 0;
 			break;
+		case SCTP_AUTHINFO:
+			/* SCTP Socket API Extension
+			 * 5.3.8 SCTP AUTH Information Structure (SCTP_AUTHINFO)
+			 *
+			 * This cmsghdr structure specifies SCTP options for sendmsg().
+			 *
+			 * cmsg_level    cmsg_type      cmsg_data[]
+			 * ------------  ------------   ---------------------
+			 * IPPROTO_SCTP  SCTP_AUTHINFO  struct sctp_authinfo
+			 */
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sctp_authinfo)))
+				return -EINVAL;
+
+			cmsgs->authinfo = CMSG_DATA(cmsg);
+			break;
 		case SCTP_DSTADDRV4:
 		case SCTP_DSTADDRV6:
 			/* SCTP Socket API Extension
-- 
cgit v1.2.3


From 601590ec155aadf5daa17a6f63a06d1bba5b5ce9 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 14 Mar 2018 19:05:32 +0800
Subject: sctp: add sockopt SCTP_AUTH_DEACTIVATE_KEY

This patch is to add sockopt SCTP_AUTH_DEACTIVATE_KEY, as described in
section 8.3.4 of RFC6458.

This set option indicates that the application will no longer send user
messages using the indicated key identifier.

Note that RFC requires that only deactivated keys that are no longer used
by an association can be deleted, but for the backward compatibility, it
is not to check deactivated when deleting or replacing one sh_key.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/auth.c   | 46 +++++++++++++++++++++++++++++++++++++++++++---
 net/sctp/socket.c | 31 +++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index e5214fd7f650..a073123fc485 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -449,8 +449,11 @@ struct sctp_shared_key *sctp_auth_get_shkey(
 
 	/* First search associations set of endpoint pair shared keys */
 	key_for_each(key, &asoc->endpoint_shared_keys) {
-		if (key->key_id == key_id)
-			return key;
+		if (key->key_id == key_id) {
+			if (!key->deactivated)
+				return key;
+			break;
+		}
 	}
 
 	return NULL;
@@ -905,7 +908,7 @@ int sctp_auth_set_active_key(struct sctp_endpoint *ep,
 		}
 	}
 
-	if (!found)
+	if (!found || key->deactivated)
 		return -EINVAL;
 
 	if (asoc) {
@@ -956,3 +959,40 @@ int sctp_auth_del_key_id(struct sctp_endpoint *ep,
 
 	return 0;
 }
+
+int sctp_auth_deact_key_id(struct sctp_endpoint *ep,
+			   struct sctp_association *asoc, __u16  key_id)
+{
+	struct sctp_shared_key *key;
+	struct list_head *sh_keys;
+	int found = 0;
+
+	/* The key identifier MUST NOT be the current active key
+	 * The key identifier MUST correst to an existing key
+	 */
+	if (asoc) {
+		if (asoc->active_key_id == key_id)
+			return -EINVAL;
+
+		sh_keys = &asoc->endpoint_shared_keys;
+	} else {
+		if (ep->active_key_id == key_id)
+			return -EINVAL;
+
+		sh_keys = &ep->endpoint_shared_keys;
+	}
+
+	key_for_each(key, sh_keys) {
+		if (key->key_id == key_id) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found)
+		return -EINVAL;
+
+	key->deactivated = 1;
+
+	return 0;
+}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9ffdecbc3531..65cc354c520f 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3646,6 +3646,33 @@ static int sctp_setsockopt_del_key(struct sock *sk,
 
 }
 
+/*
+ * 8.3.4  Deactivate a Shared Key (SCTP_AUTH_DEACTIVATE_KEY)
+ *
+ * This set option will deactivate a shared secret key.
+ */
+static int sctp_setsockopt_deactivate_key(struct sock *sk, char __user *optval,
+					  unsigned int optlen)
+{
+	struct sctp_endpoint *ep = sctp_sk(sk)->ep;
+	struct sctp_authkeyid val;
+	struct sctp_association *asoc;
+
+	if (!ep->auth_enable)
+		return -EACCES;
+
+	if (optlen != sizeof(struct sctp_authkeyid))
+		return -EINVAL;
+	if (copy_from_user(&val, optval, optlen))
+		return -EFAULT;
+
+	asoc = sctp_id2assoc(sk, val.scact_assoc_id);
+	if (!asoc && val.scact_assoc_id && sctp_style(sk, UDP))
+		return -EINVAL;
+
+	return sctp_auth_deact_key_id(ep, asoc, val.scact_keynumber);
+}
+
 /*
  * 8.1.23 SCTP_AUTO_ASCONF
  *
@@ -4238,6 +4265,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTH_DELETE_KEY:
 		retval = sctp_setsockopt_del_key(sk, optval, optlen);
 		break;
+	case SCTP_AUTH_DEACTIVATE_KEY:
+		retval = sctp_setsockopt_deactivate_key(sk, optval, optlen);
+		break;
 	case SCTP_AUTO_ASCONF:
 		retval = sctp_setsockopt_auto_asconf(sk, optval, optlen);
 		break;
@@ -7212,6 +7242,7 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
 	case SCTP_AUTH_KEY:
 	case SCTP_AUTH_CHUNK:
 	case SCTP_AUTH_DELETE_KEY:
+	case SCTP_AUTH_DEACTIVATE_KEY:
 		retval = -EOPNOTSUPP;
 		break;
 	case SCTP_HMAC_IDENT:
-- 
cgit v1.2.3


From ec2e506c680deaa8e1a087986db6d73ba63a04be Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 14 Mar 2018 19:05:33 +0800
Subject: sctp: add SCTP_AUTH_FREE_KEY type for AUTHENTICATION_EVENT

This patch is to add SCTP_AUTH_FREE_KEY type for AUTHENTICATION_EVENT,
as described in section 6.1.8 of RFC6458.

      SCTP_AUTH_FREE_KEY:  This report indicates that the SCTP
         implementation will no longer use the key identifier specified
         in auth_keynumber.

After deactivating a key, it would never be used again, which means
it's refcnt can't be held/increased by new chunks. But there may be
some chunks in out queue still using it. So only when refcnt is 1,
which means no chunk in outqueue is using/holding this key either,
this EVENT would be sent.

When users receive this notification, they could do DEL_KEY sockopt to
remove this shkey, and also tell the peer that this key won't be used
in any chunk thoroughly from now on, then the peer can remove it as
well safely.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/auth.c          | 14 ++++++++++++++
 net/sctp/sm_make_chunk.c | 20 +++++++++++++++++++-
 net/sctp/sm_statefuns.c  |  2 +-
 net/sctp/socket.c        | 19 ++++++++++++++++++-
 4 files changed, 52 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index a073123fc485..e64630cd3331 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -992,6 +992,20 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep,
 	if (!found)
 		return -EINVAL;
 
+	/* refcnt == 1 and !list_empty mean it's not being used anywhere
+	 * and deactivated will be set, so it's time to notify userland
+	 * that this shkey can be freed.
+	 */
+	if (asoc && !list_empty(&key->key_list) &&
+	    refcount_read(&key->refcnt) == 1) {
+		struct sctp_ulpevent *ev;
+
+		ev = sctp_ulpevent_make_authkey(asoc, key->key_id,
+						SCTP_AUTH_FREE_KEY, GFP_KERNEL);
+		if (ev)
+			asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
+	}
+
 	key->deactivated = 1;
 
 	return 0;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 10f071cdf188..cc20bc39ee7c 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -89,8 +89,26 @@ static void sctp_control_release_owner(struct sk_buff *skb)
 {
 	struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg;
 
-	if (chunk->shkey)
+	if (chunk->shkey) {
+		struct sctp_shared_key *shkey = chunk->shkey;
+		struct sctp_association *asoc = chunk->asoc;
+
+		/* refcnt == 2 and !list_empty mean after this release, it's
+		 * not being used anywhere, and it's time to notify userland
+		 * that this shkey can be freed if it's been deactivated.
+		 */
+		if (shkey->deactivated && !list_empty(&shkey->key_list) &&
+		    refcount_read(&shkey->refcnt) == 2) {
+			struct sctp_ulpevent *ev;
+
+			ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id,
+							SCTP_AUTH_FREE_KEY,
+							GFP_KERNEL);
+			if (ev)
+				asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
+		}
 		sctp_auth_shkey_release(chunk->shkey);
+	}
 }
 
 static void sctp_control_set_owner_w(struct sctp_chunk *chunk)
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 792e0e2be320..1e41dee70b51 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -4246,7 +4246,7 @@ enum sctp_disposition sctp_sf_eat_auth(struct net *net,
 		struct sctp_ulpevent *ev;
 
 		ev = sctp_ulpevent_make_authkey(asoc, ntohs(auth_hdr->shkey_id),
-				    SCTP_AUTH_NEWKEY, GFP_ATOMIC);
+				    SCTP_AUTH_NEW_KEY, GFP_ATOMIC);
 
 		if (!ev)
 			return -ENOMEM;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 65cc354c520f..aeecdd620c45 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -8166,8 +8166,25 @@ static void sctp_wfree(struct sk_buff *skb)
 	sk->sk_wmem_queued   -= skb->truesize;
 	sk_mem_uncharge(sk, skb->truesize);
 
-	if (chunk->shkey)
+	if (chunk->shkey) {
+		struct sctp_shared_key *shkey = chunk->shkey;
+
+		/* refcnt == 2 and !list_empty mean after this release, it's
+		 * not being used anywhere, and it's time to notify userland
+		 * that this shkey can be freed if it's been deactivated.
+		 */
+		if (shkey->deactivated && !list_empty(&shkey->key_list) &&
+		    refcount_read(&shkey->refcnt) == 2) {
+			struct sctp_ulpevent *ev;
+
+			ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id,
+							SCTP_AUTH_FREE_KEY,
+							GFP_KERNEL);
+			if (ev)
+				asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
+		}
 		sctp_auth_shkey_release(chunk->shkey);
+	}
 
 	sock_wfree(skb);
 	sctp_wake_up_waiters(sk, asoc);
-- 
cgit v1.2.3


From 30f6ebf65bc46161c5aaff1db2e6e7c76aa4a06b Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Wed, 14 Mar 2018 19:05:34 +0800
Subject: sctp: add SCTP_AUTH_NO_AUTH type for AUTHENTICATION_EVENT

This patch is to add SCTP_AUTH_NO_AUTH type for AUTHENTICATION_EVENT,
as described in section 6.1.8 of RFC6458.

      SCTP_AUTH_NO_AUTH:  This report indicates that the peer does not
         support SCTP authentication as defined in [RFC4895].

Note that the implementation is quite similar as that of
SCTP_ADAPTATION_INDICATION.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/sm_sideeffect.c | 13 +++++++++++++
 net/sctp/sm_statefuns.c  | 43 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 54 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index b71e7fb0a20a..298112ca8c06 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1049,6 +1049,16 @@ static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands,
 		asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
 }
 
+static void sctp_cmd_peer_no_auth(struct sctp_cmd_seq *commands,
+				  struct sctp_association *asoc)
+{
+	struct sctp_ulpevent *ev;
+
+	ev = sctp_ulpevent_make_authkey(asoc, 0, SCTP_AUTH_NO_AUTH, GFP_ATOMIC);
+	if (ev)
+		asoc->stream.si->enqueue_event(&asoc->ulpq, ev);
+}
+
 /* Helper function to generate an adaptation indication event */
 static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands,
 				    struct sctp_association *asoc)
@@ -1755,6 +1765,9 @@ static int sctp_cmd_interpreter(enum sctp_event event_type,
 		case SCTP_CMD_ADAPTATION_IND:
 			sctp_cmd_adaptation_ind(commands, asoc);
 			break;
+		case SCTP_CMD_PEER_NO_AUTH:
+			sctp_cmd_peer_no_auth(commands, asoc);
+			break;
 
 		case SCTP_CMD_ASSOC_SHKEY:
 			error = sctp_auth_asoc_init_active_key(asoc,
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 1e41dee70b51..cc56a67dbb4d 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -659,7 +659,7 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net,
 					 void *arg,
 					 struct sctp_cmd_seq *commands)
 {
-	struct sctp_ulpevent *ev, *ai_ev = NULL;
+	struct sctp_ulpevent *ev, *ai_ev = NULL, *auth_ev = NULL;
 	struct sctp_association *new_asoc;
 	struct sctp_init_chunk *peer_init;
 	struct sctp_chunk *chunk = arg;
@@ -820,6 +820,14 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net,
 			goto nomem_aiev;
 	}
 
+	if (!new_asoc->peer.auth_capable) {
+		auth_ev = sctp_ulpevent_make_authkey(new_asoc, 0,
+						     SCTP_AUTH_NO_AUTH,
+						     GFP_ATOMIC);
+		if (!auth_ev)
+			goto nomem_authev;
+	}
+
 	/* Add all the state machine commands now since we've created
 	 * everything.  This way we don't introduce memory corruptions
 	 * during side-effect processing and correclty count established
@@ -847,8 +855,14 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net,
 		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
 				SCTP_ULPEVENT(ai_ev));
 
+	if (auth_ev)
+		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+				SCTP_ULPEVENT(auth_ev));
+
 	return SCTP_DISPOSITION_CONSUME;
 
+nomem_authev:
+	sctp_ulpevent_free(ai_ev);
 nomem_aiev:
 	sctp_ulpevent_free(ev);
 nomem_ev:
@@ -953,6 +967,15 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net,
 				SCTP_ULPEVENT(ev));
 	}
 
+	if (!asoc->peer.auth_capable) {
+		ev = sctp_ulpevent_make_authkey(asoc, 0, SCTP_AUTH_NO_AUTH,
+						GFP_ATOMIC);
+		if (!ev)
+			goto nomem;
+		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+				SCTP_ULPEVENT(ev));
+	}
+
 	return SCTP_DISPOSITION_CONSUME;
 nomem:
 	return SCTP_DISPOSITION_NOMEM;
@@ -1908,6 +1931,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_b(
 	if (asoc->peer.adaptation_ind)
 		sctp_add_cmd_sf(commands, SCTP_CMD_ADAPTATION_IND, SCTP_NULL());
 
+	if (!asoc->peer.auth_capable)
+		sctp_add_cmd_sf(commands, SCTP_CMD_PEER_NO_AUTH, SCTP_NULL());
+
 	return SCTP_DISPOSITION_CONSUME;
 
 nomem:
@@ -1954,7 +1980,7 @@ static enum sctp_disposition sctp_sf_do_dupcook_d(
 					struct sctp_cmd_seq *commands,
 					struct sctp_association *new_asoc)
 {
-	struct sctp_ulpevent *ev = NULL, *ai_ev = NULL;
+	struct sctp_ulpevent *ev = NULL, *ai_ev = NULL, *auth_ev = NULL;
 	struct sctp_chunk *repl;
 
 	/* Clarification from Implementor's Guide:
@@ -2001,6 +2027,14 @@ static enum sctp_disposition sctp_sf_do_dupcook_d(
 				goto nomem;
 
 		}
+
+		if (!asoc->peer.auth_capable) {
+			auth_ev = sctp_ulpevent_make_authkey(asoc, 0,
+							     SCTP_AUTH_NO_AUTH,
+							     GFP_ATOMIC);
+			if (!auth_ev)
+				goto nomem;
+		}
 	}
 
 	repl = sctp_make_cookie_ack(new_asoc, chunk);
@@ -2015,10 +2049,15 @@ static enum sctp_disposition sctp_sf_do_dupcook_d(
 	if (ai_ev)
 		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
 					SCTP_ULPEVENT(ai_ev));
+	if (auth_ev)
+		sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
+				SCTP_ULPEVENT(auth_ev));
 
 	return SCTP_DISPOSITION_CONSUME;
 
 nomem:
+	if (auth_ev)
+		sctp_ulpevent_free(auth_ev);
 	if (ai_ev)
 		sctp_ulpevent_free(ai_ev);
 	if (ev)
-- 
cgit v1.2.3


From 0c3d5a96d5e5e6d5662d335a3bdfb76f35433f18 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 12 Mar 2018 08:07:12 -0700
Subject: net: drivers/net: Remove unnecessary skb_copy_expand OOM messages

skb_copy_expand without __GFP_NOWARN already does a dump_stack
on OOM so these messages are redundant.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mac80211/rx.c               | 5 +----
 net/netfilter/nfnetlink_queue.c | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index d01743234cf6..9c898a3688c6 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2549,11 +2549,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
 
 	fwd_skb = skb_copy_expand(skb, local->tx_headroom +
 				       sdata->encrypt_headroom, 0, GFP_ATOMIC);
-	if (!fwd_skb) {
-		net_info_ratelimited("%s: failed to clone mesh frame\n",
-				    sdata->name);
+	if (!fwd_skb)
 		goto out;
-	}
 
 	fwd_hdr =  (struct ieee80211_hdr *) fwd_skb->data;
 	fwd_hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_RETRY);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 59e2833c17f1..9f572ed56208 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -833,11 +833,8 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
 		if (diff > skb_tailroom(e->skb)) {
 			nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
 					       diff, GFP_ATOMIC);
-			if (!nskb) {
-				printk(KERN_WARNING "nf_queue: OOM "
-				      "in mangle, dropping packet\n");
+			if (!nskb)
 				return -ENOMEM;
-			}
 			kfree_skb(e->skb);
 			e->skb = nskb;
 		}
-- 
cgit v1.2.3


From 0aee4c259849099cb07ead6cd7fff74e561d5225 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Mon, 12 Mar 2018 14:15:25 -0400
Subject: sctp: Fix double free in sctp_sendmsg_to_asoc

syzbot/kasan detected a double free in sctp_sendmsg_to_asoc:
BUG: KASAN: use-after-free in sctp_association_free+0x7b7/0x930
net/sctp/associola.c:332
Read of size 8 at addr ffff8801d8006ae0 by task syzkaller914861/4202

CPU: 1 PID: 4202 Comm: syzkaller914861 Not tainted 4.16.0-rc4+ #258
Hardware name: Google Google Compute Engine/Google Compute Engine
01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:17 [inline]
 dump_stack+0x194/0x24d lib/dump_stack.c:53
 print_address_description+0x73/0x250 mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report+0x23c/0x360 mm/kasan/report.c:412
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
 sctp_association_free+0x7b7/0x930 net/sctp/associola.c:332
 sctp_sendmsg+0xc67/0x1a80 net/sctp/socket.c:2075
 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763
 sock_sendmsg_nosec net/socket.c:629 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:639
 SYSC_sendto+0x361/0x5c0 net/socket.c:1748
 SyS_sendto+0x40/0x50 net/socket.c:1716
 do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
 entry_SYSCALL_64_after_hwframe+0x42/0xb7

This was introduced by commit:
f84af33 sctp: factor out sctp_sendmsg_to_asoc from sctp_sendmsg

As the newly refactored function moved the wait_for_sndbuf call to a
point after the association was connected, allowing for peeloff events
to occur, which in turn caused wait_for_sndbuf to return -EPIPE which
was not caught by the logic that determines if an association should be
freed or not.

Fix it the easy way by returning the ordering of
sctp_primitive_ASSOCIATE and sctp_wait_for_sndbuf to the old order, to
ensure that EPIPE will not happen.

Tested by myself using the syzbot reproducers with positive results

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: davem@davemloft.net
CC: Xin Long <lucien.xin@gmail.com>
Reported-by: syzbot+a4e4112c3aff00c8cfd8@syzkaller.appspotmail.com
Reviewed-by: Xin Long <lucien.xin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/socket.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index aeecdd620c45..7a10ae3c3d82 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1883,6 +1883,19 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 		goto err;
 	}
 
+	if (asoc->pmtu_pending)
+		sctp_assoc_pending_pmtu(asoc);
+
+	if (sctp_wspace(asoc) < msg_len)
+		sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
+
+	if (!sctp_wspace(asoc)) {
+		timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+		err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
+		if (err)
+			goto err;
+	}
+
 	if (sctp_state(asoc, CLOSED)) {
 		err = sctp_primitive_ASSOCIATE(net, asoc, NULL);
 		if (err)
@@ -1900,19 +1913,6 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
 		pr_debug("%s: we associated primitively\n", __func__);
 	}
 
-	if (asoc->pmtu_pending)
-		sctp_assoc_pending_pmtu(asoc);
-
-	if (sctp_wspace(asoc) < msg_len)
-		sctp_prsctp_prune(asoc, sinfo, msg_len - sctp_wspace(asoc));
-
-	if (!sctp_wspace(asoc)) {
-		timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
-		err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
-		if (err)
-			goto err;
-	}
-
 	datamsg = sctp_datamsg_from_user(asoc, sinfo, &msg->msg_iter);
 	if (IS_ERR(datamsg)) {
 		err = PTR_ERR(datamsg);
-- 
cgit v1.2.3


From 650b4eca47399420a1644056c41b66735e250d35 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 12 Mar 2018 17:25:38 +0000
Subject: rxrpc: remove redundant initialization of variable 'len'

The variable 'len' is being initialized with a value that is never
read and it is re-assigned later, hence the initialization is redundant
and can be removed.

Cleans up clang warning:
net/rxrpc/recvmsg.c:275:15: warning: Value stored to 'len' during its
initialization is never read

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/recvmsg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 9d45d8b56744..7bff716e911e 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -272,7 +272,7 @@ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb,
 			     unsigned int *_offset, unsigned int *_len)
 {
 	unsigned int offset = sizeof(struct rxrpc_wire_header);
-	unsigned int len = *_len;
+	unsigned int len;
 	int ret;
 	u8 annotation = *_annotation;
 
-- 
cgit v1.2.3


From 9fbb704c3385adf5f9adfce7fd3c0bf31aa4da6d Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 13 Mar 2018 08:29:36 -0700
Subject: net/ipv6: Refactor gateway validation on route add

Move gateway validation code from ip6_route_info_create into
ip6_validate_gw. Code move plus adjustments to handle the potential
reset of dev and idev and to make checkpatch happy.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 120 ++++++++++++++++++++++++++++++-------------------------
 1 file changed, 66 insertions(+), 54 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 81711e3e2604..23ced851fdb1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2550,7 +2550,7 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
 
 static int ip6_route_check_nh_onlink(struct net *net,
 				     struct fib6_config *cfg,
-				     struct net_device *dev,
+				     const struct net_device *dev,
 				     struct netlink_ext_ack *extack)
 {
 	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
@@ -2626,6 +2626,68 @@ out:
 	return err;
 }
 
+static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
+			   struct net_device **_dev, struct inet6_dev **idev,
+			   struct netlink_ext_ack *extack)
+{
+	const struct in6_addr *gw_addr = &cfg->fc_gateway;
+	int gwa_type = ipv6_addr_type(gw_addr);
+	const struct net_device *dev = *_dev;
+	int err = -EINVAL;
+
+	/* if gw_addr is local we will fail to detect this in case
+	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
+	 * will return already-added prefix route via interface that
+	 * prefix route was assigned to, which might be non-loopback.
+	 */
+	if (ipv6_chk_addr_and_flags(net, gw_addr,
+				    gwa_type & IPV6_ADDR_LINKLOCAL ?
+				    dev : NULL, 0, 0)) {
+		NL_SET_ERR_MSG(extack, "Invalid gateway address");
+		goto out;
+	}
+
+	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
+		/* IPv6 strictly inhibits using not link-local
+		 * addresses as nexthop address.
+		 * Otherwise, router will not able to send redirects.
+		 * It is very good, but in some (rare!) circumstances
+		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
+		 * some exceptions. --ANK
+		 * We allow IPv4-mapped nexthops to support RFC4798-type
+		 * addressing
+		 */
+		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
+			NL_SET_ERR_MSG(extack, "Invalid gateway address");
+			goto out;
+		}
+
+		if (cfg->fc_flags & RTNH_F_ONLINK)
+			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
+		else
+			err = ip6_route_check_nh(net, cfg, _dev, idev);
+
+		if (err)
+			goto out;
+	}
+
+	/* reload in case device was changed */
+	dev = *_dev;
+
+	err = -EINVAL;
+	if (!dev) {
+		NL_SET_ERR_MSG(extack, "Egress device not specified");
+		goto out;
+	} else if (dev->flags & IFF_LOOPBACK) {
+		NL_SET_ERR_MSG(extack,
+			       "Egress device can not be loopback device for this route");
+		goto out;
+	}
+	err = 0;
+out:
+	return err;
+}
+
 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 					      struct netlink_ext_ack *extack)
 {
@@ -2808,61 +2870,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 	}
 
 	if (cfg->fc_flags & RTF_GATEWAY) {
-		const struct in6_addr *gw_addr;
-		int gwa_type;
-
-		gw_addr = &cfg->fc_gateway;
-		gwa_type = ipv6_addr_type(gw_addr);
-
-		/* if gw_addr is local we will fail to detect this in case
-		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
-		 * will return already-added prefix route via interface that
-		 * prefix route was assigned to, which might be non-loopback.
-		 */
-		err = -EINVAL;
-		if (ipv6_chk_addr_and_flags(net, gw_addr,
-					    gwa_type & IPV6_ADDR_LINKLOCAL ?
-					    dev : NULL, 0, 0)) {
-			NL_SET_ERR_MSG(extack, "Invalid gateway address");
+		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
+		if (err)
 			goto out;
-		}
-		rt->rt6i_gateway = *gw_addr;
-
-		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
-			/* IPv6 strictly inhibits using not link-local
-			   addresses as nexthop address.
-			   Otherwise, router will not able to send redirects.
-			   It is very good, but in some (rare!) circumstances
-			   (SIT, PtP, NBMA NOARP links) it is handy to allow
-			   some exceptions. --ANK
-			   We allow IPv4-mapped nexthops to support RFC4798-type
-			   addressing
-			 */
-			if (!(gwa_type & (IPV6_ADDR_UNICAST |
-					  IPV6_ADDR_MAPPED))) {
-				NL_SET_ERR_MSG(extack,
-					       "Invalid gateway address");
-				goto out;
-			}
 
-			if (cfg->fc_flags & RTNH_F_ONLINK) {
-				err = ip6_route_check_nh_onlink(net, cfg, dev,
-								extack);
-			} else {
-				err = ip6_route_check_nh(net, cfg, &dev, &idev);
-			}
-			if (err)
-				goto out;
-		}
-		err = -EINVAL;
-		if (!dev) {
-			NL_SET_ERR_MSG(extack, "Egress device not specified");
-			goto out;
-		} else if (dev->flags & IFF_LOOPBACK) {
-			NL_SET_ERR_MSG(extack,
-				       "Egress device can not be loopback device for this route");
-			goto out;
-		}
+		rt->rt6i_gateway = cfg->fc_gateway;
 	}
 
 	err = -ENODEV;
-- 
cgit v1.2.3


From 232378e8db4780bc7145d7a0ee47f5f80a41ad6b Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 13 Mar 2018 08:29:37 -0700
Subject: net/ipv6: Change address check to always take a device argument

ipv6_chk_addr_and_flags determines if an address is a local address and
optionally if it is an address on a specific device. For example, it is
called by ip6_route_info_create to determine if a given gateway address
is a local address. The address check currently does not consider L3
domains and as a result does not allow a route to be added in one VRF
if the nexthop points to an address in a second VRF. e.g.,

    $ ip route add 2001:db8:1::/64 vrf r2 via 2001:db8:102::23
    Error: Invalid gateway address.

where 2001:db8:102::23 is an address on an interface in vrf r1.

ipv6_chk_addr_and_flags needs to allow callers to always pass in a device
with a separate argument to not limit the address to the specific device.
The device is used used to determine the L3 domain of interest.

To that end add an argument to skip the device check and update callers
to always pass a device where possible and use the new argument to mean
any address in the domain.

Update a handful of users of ipv6_chk_addr with a NULL dev argument. This
patch handles the change to these callers without adding the domain check.

ip6_validate_gw needs to handle 2 cases - one where the device is given
as part of the nexthop spec and the other where the device is resolved.
There is at least 1 VRF case where deferring the check to only after
the route lookup has resolved the device fails with an unintuitive error
"RTNETLINK answers: No route to host" as opposed to the preferred
"Error: Gateway can not be a local address." The 'no route to host'
error is because of the fallback to a full lookup. The check is done
twice to avoid this error.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c   | 11 ++++++++---
 net/ipv6/anycast.c    |  9 ++++++---
 net/ipv6/datagram.c   |  5 +++--
 net/ipv6/ip6_tunnel.c | 12 ++++++++----
 net/ipv6/ndisc.c      |  2 +-
 net/ipv6/route.c      | 19 +++++++++++++++----
 6 files changed, 41 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b5fd116c046a..0677b9732d56 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1851,19 +1851,24 @@ static int ipv6_count_addresses(const struct inet6_dev *idev)
 int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
 		  const struct net_device *dev, int strict)
 {
-	return ipv6_chk_addr_and_flags(net, addr, dev, strict, IFA_F_TENTATIVE);
+	return ipv6_chk_addr_and_flags(net, addr, dev, !dev,
+				       strict, IFA_F_TENTATIVE);
 }
 EXPORT_SYMBOL(ipv6_chk_addr);
 
 int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
-			    const struct net_device *dev, int strict,
-			    u32 banned_flags)
+			    const struct net_device *dev, bool skip_dev_check,
+			    int strict, u32 banned_flags)
 {
 	unsigned int hash = inet6_addr_hash(net, addr);
 	struct inet6_ifaddr *ifp;
 	u32 ifp_flags;
 
 	rcu_read_lock();
+
+	if (skip_dev_check)
+		dev = NULL;
+
 	hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
 		if (!net_eq(dev_net(ifp->idev->dev), net))
 			continue;
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index c61718dba2e6..d580d4d456a5 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -66,7 +66,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 		return -EPERM;
 	if (ipv6_addr_is_multicast(addr))
 		return -EINVAL;
-	if (ipv6_chk_addr(net, addr, NULL, 0))
+
+	if (ifindex)
+		dev = __dev_get_by_index(net, ifindex);
+
+	if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE))
 		return -EINVAL;
 
 	pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL);
@@ -90,8 +94,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 			dev = __dev_get_by_flags(net, IFF_UP,
 						 IFF_UP | IFF_LOOPBACK);
 		}
-	} else
-		dev = __dev_get_by_index(net, ifindex);
+	}
 
 	if (!dev) {
 		err = -ENODEV;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index fbf08ce3f5ab..b27333d7b099 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -801,8 +801,9 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
 			if (addr_type != IPV6_ADDR_ANY) {
 				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
 				if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) &&
-				    !ipv6_chk_addr(net, &src_info->ipi6_addr,
-						   strict ? dev : NULL, 0) &&
+				    !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr,
+							     dev, !strict, 0,
+							     IFA_F_TENTATIVE) &&
 				    !ipv6_chk_acast_addr_src(net, dev,
 							     &src_info->ipi6_addr))
 					err = -EINVAL;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 5c045fa407da..456fcf942f95 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -758,9 +758,11 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t,
 			ldev = dev_get_by_index_rcu(net, p->link);
 
 		if ((ipv6_addr_is_multicast(laddr) ||
-		     likely(ipv6_chk_addr(net, laddr, ldev, 0))) &&
+		     likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false,
+						    0, IFA_F_TENTATIVE))) &&
 		    ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) ||
-		     likely(!ipv6_chk_addr(net, raddr, NULL, 0))))
+		     likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true,
+						     0, IFA_F_TENTATIVE))))
 			ret = 1;
 	}
 	return ret;
@@ -990,12 +992,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
 		if (p->link)
 			ldev = dev_get_by_index_rcu(net, p->link);
 
-		if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0)))
+		if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false,
+						      0, IFA_F_TENTATIVE)))
 			pr_warn("%s xmit: Local address not yet configured!\n",
 				p->name);
 		else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) &&
 			 !ipv6_addr_is_multicast(raddr) &&
-			 unlikely(ipv6_chk_addr(net, raddr, NULL, 0)))
+			 unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev,
+							  true, 0, IFA_F_TENTATIVE)))
 			pr_warn("%s xmit: Routing loop! Remote address found on this node!\n",
 				p->name);
 		else
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 8af5eef464c1..10024eb0c521 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -707,7 +707,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	int probes = atomic_read(&neigh->probes);
 
 	if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr,
-					   dev, 1,
+					   dev, false, 1,
 					   IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))
 		saddr = &ipv6_hdr(skb)->saddr;
 	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 23ced851fdb1..939d122e71b4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2632,7 +2632,9 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
 {
 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
 	int gwa_type = ipv6_addr_type(gw_addr);
+	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
 	const struct net_device *dev = *_dev;
+	bool need_addr_check = !dev;
 	int err = -EINVAL;
 
 	/* if gw_addr is local we will fail to detect this in case
@@ -2640,10 +2642,9 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
 	 * will return already-added prefix route via interface that
 	 * prefix route was assigned to, which might be non-loopback.
 	 */
-	if (ipv6_chk_addr_and_flags(net, gw_addr,
-				    gwa_type & IPV6_ADDR_LINKLOCAL ?
-				    dev : NULL, 0, 0)) {
-		NL_SET_ERR_MSG(extack, "Invalid gateway address");
+	if (dev &&
+	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
+		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
 		goto out;
 	}
 
@@ -2683,6 +2684,16 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
 			       "Egress device can not be loopback device for this route");
 		goto out;
 	}
+
+	/* if we did not check gw_addr above, do so now that the
+	 * egress device has been resolved.
+	 */
+	if (need_addr_check &&
+	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
+		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
+		goto out;
+	}
+
 	err = 0;
 out:
 	return err;
-- 
cgit v1.2.3


From 1893ff20275b9b9be4892b5b5785788ce45abdea Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 13 Mar 2018 08:29:38 -0700
Subject: net/ipv6: Add l3mdev check to ipv6_chk_addr_and_flags

Lookup the L3 master device for the passed in device. Only consider
addresses on netdev's with the same master device. If the device is
not enslaved or is NULL, then the l3mdev is NULL which means only
devices not enslaved (ie, in the default domain) are considered.

Signed-off-by: David Ahern <dsahern@gmail.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 0677b9732d56..6fd4bbdc444f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1856,22 +1856,37 @@ int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
 }
 EXPORT_SYMBOL(ipv6_chk_addr);
 
+/* device argument is used to find the L3 domain of interest. If
+ * skip_dev_check is set, then the ifp device is not checked against
+ * the passed in dev argument. So the 2 cases for addresses checks are:
+ *   1. does the address exist in the L3 domain that dev is part of
+ *      (skip_dev_check = true), or
+ *
+ *   2. does the address exist on the specific device
+ *      (skip_dev_check = false)
+ */
 int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
 			    const struct net_device *dev, bool skip_dev_check,
 			    int strict, u32 banned_flags)
 {
 	unsigned int hash = inet6_addr_hash(net, addr);
+	const struct net_device *l3mdev;
 	struct inet6_ifaddr *ifp;
 	u32 ifp_flags;
 
 	rcu_read_lock();
 
+	l3mdev = l3mdev_master_dev_rcu(dev);
 	if (skip_dev_check)
 		dev = NULL;
 
 	hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
 		if (!net_eq(dev_net(ifp->idev->dev), net))
 			continue;
+
+		if (l3mdev_master_dev_rcu(ifp->idev->dev) != l3mdev)
+			continue;
+
 		/* Decouple optimistic from tentative for evaluation here.
 		 * Ban optimistic addresses explicitly, when required.
 		 */
-- 
cgit v1.2.3


From 1e8029515816f771b9b3751f24f19fe6df4c72ae Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Date: Tue, 13 Mar 2018 21:57:16 -0700
Subject: udp: Move the udp sysctl to namespace.

This patch moves the udp_rmem_min, udp_wmem_min
to namespace and init the udp_l3mdev_accept explicitly.

The udp_rmem_min/udp_wmem_min affect udp rx/tx queue,
with this patch namespaces can set them differently.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/sysctl_net_ipv4.c | 32 ++++++++---------
 net/ipv4/udp.c             | 86 +++++++++++++++++++++++++++-------------------
 net/ipv6/udp.c             | 52 ++++++++++++++--------------
 3 files changed, 93 insertions(+), 77 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 011de9a20ec6..5b72d97693f8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -520,22 +520,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
 	},
-	{
-		.procname	= "udp_rmem_min",
-		.data		= &sysctl_udp_rmem_min,
-		.maxlen		= sizeof(sysctl_udp_rmem_min),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one
-	},
-	{
-		.procname	= "udp_wmem_min",
-		.data		= &sysctl_udp_wmem_min,
-		.maxlen		= sizeof(sysctl_udp_wmem_min),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &one
-	},
 	{ }
 };
 
@@ -1167,6 +1151,22 @@ static struct ctl_table ipv4_net_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one,
 	},
+	{
+		.procname	= "udp_rmem_min",
+		.data		= &init_net.ipv4.sysctl_udp_rmem_min,
+		.maxlen		= sizeof(init_net.ipv4.sysctl_udp_rmem_min),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one
+	},
+	{
+		.procname	= "udp_wmem_min",
+		.data		= &init_net.ipv4.sysctl_udp_wmem_min,
+		.maxlen		= sizeof(init_net.ipv4.sysctl_udp_wmem_min),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one
+	},
 	{ }
 };
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3013404d0935..908fc02fb4f8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -122,12 +122,6 @@ EXPORT_SYMBOL(udp_table);
 long sysctl_udp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_udp_mem);
 
-int sysctl_udp_rmem_min __read_mostly;
-EXPORT_SYMBOL(sysctl_udp_rmem_min);
-
-int sysctl_udp_wmem_min __read_mostly;
-EXPORT_SYMBOL(sysctl_udp_wmem_min);
-
 atomic_long_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
@@ -2533,35 +2527,35 @@ int udp_abort(struct sock *sk, int err)
 EXPORT_SYMBOL_GPL(udp_abort);
 
 struct proto udp_prot = {
-	.name		   = "UDP",
-	.owner		   = THIS_MODULE,
-	.close		   = udp_lib_close,
-	.connect	   = ip4_datagram_connect,
-	.disconnect	   = udp_disconnect,
-	.ioctl		   = udp_ioctl,
-	.init		   = udp_init_sock,
-	.destroy	   = udp_destroy_sock,
-	.setsockopt	   = udp_setsockopt,
-	.getsockopt	   = udp_getsockopt,
-	.sendmsg	   = udp_sendmsg,
-	.recvmsg	   = udp_recvmsg,
-	.sendpage	   = udp_sendpage,
-	.release_cb	   = ip4_datagram_release_cb,
-	.hash		   = udp_lib_hash,
-	.unhash		   = udp_lib_unhash,
-	.rehash		   = udp_v4_rehash,
-	.get_port	   = udp_v4_get_port,
-	.memory_allocated  = &udp_memory_allocated,
-	.sysctl_mem	   = sysctl_udp_mem,
-	.sysctl_wmem	   = &sysctl_udp_wmem_min,
-	.sysctl_rmem	   = &sysctl_udp_rmem_min,
-	.obj_size	   = sizeof(struct udp_sock),
-	.h.udp_table	   = &udp_table,
+	.name			= "UDP",
+	.owner			= THIS_MODULE,
+	.close			= udp_lib_close,
+	.connect		= ip4_datagram_connect,
+	.disconnect		= udp_disconnect,
+	.ioctl			= udp_ioctl,
+	.init			= udp_init_sock,
+	.destroy		= udp_destroy_sock,
+	.setsockopt		= udp_setsockopt,
+	.getsockopt		= udp_getsockopt,
+	.sendmsg		= udp_sendmsg,
+	.recvmsg		= udp_recvmsg,
+	.sendpage		= udp_sendpage,
+	.release_cb		= ip4_datagram_release_cb,
+	.hash			= udp_lib_hash,
+	.unhash			= udp_lib_unhash,
+	.rehash			= udp_v4_rehash,
+	.get_port		= udp_v4_get_port,
+	.memory_allocated	= &udp_memory_allocated,
+	.sysctl_mem		= sysctl_udp_mem,
+	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_udp_wmem_min),
+	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_udp_rmem_min),
+	.obj_size		= sizeof(struct udp_sock),
+	.h.udp_table		= &udp_table,
 #ifdef CONFIG_COMPAT
-	.compat_setsockopt = compat_udp_setsockopt,
-	.compat_getsockopt = compat_udp_getsockopt,
+	.compat_setsockopt	= compat_udp_setsockopt,
+	.compat_getsockopt	= compat_udp_getsockopt,
 #endif
-	.diag_destroy	   = udp_abort,
+	.diag_destroy		= udp_abort,
 };
 EXPORT_SYMBOL(udp_prot);
 
@@ -2831,6 +2825,26 @@ u32 udp_flow_hashrnd(void)
 }
 EXPORT_SYMBOL(udp_flow_hashrnd);
 
+static void __udp_sysctl_init(struct net *net)
+{
+	net->ipv4.sysctl_udp_rmem_min = SK_MEM_QUANTUM;
+	net->ipv4.sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+	net->ipv4.sysctl_udp_l3mdev_accept = 0;
+#endif
+}
+
+static int __net_init udp_sysctl_init(struct net *net)
+{
+	__udp_sysctl_init(net);
+	return 0;
+}
+
+static struct pernet_operations __net_initdata udp_sysctl_ops = {
+	.init       = udp_sysctl_init,
+};
+
 void __init udp_init(void)
 {
 	unsigned long limit;
@@ -2843,8 +2857,7 @@ void __init udp_init(void)
 	sysctl_udp_mem[1] = limit;
 	sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
 
-	sysctl_udp_rmem_min = SK_MEM_QUANTUM;
-	sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+	__udp_sysctl_init(&init_net);
 
 	/* 16 spinlocks per cpu */
 	udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
@@ -2854,4 +2867,7 @@ void __init udp_init(void)
 		panic("UDP: failed to alloc udp_busylocks\n");
 	for (i = 0; i < (1U << udp_busylocks_log); i++)
 		spin_lock_init(udp_busylocks + i);
+
+	if (register_pernet_subsys(&udp_sysctl_ops))
+		panic("UDP: failed to init sysctl parameters.\n");
 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 52e3ea0e6f50..ad30f5e31969 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1509,34 +1509,34 @@ void udp6_proc_exit(struct net *net)
 /* ------------------------------------------------------------------------ */
 
 struct proto udpv6_prot = {
-	.name		   = "UDPv6",
-	.owner		   = THIS_MODULE,
-	.close		   = udp_lib_close,
-	.connect	   = ip6_datagram_connect,
-	.disconnect	   = udp_disconnect,
-	.ioctl		   = udp_ioctl,
-	.init		   = udp_init_sock,
-	.destroy	   = udpv6_destroy_sock,
-	.setsockopt	   = udpv6_setsockopt,
-	.getsockopt	   = udpv6_getsockopt,
-	.sendmsg	   = udpv6_sendmsg,
-	.recvmsg	   = udpv6_recvmsg,
-	.release_cb	   = ip6_datagram_release_cb,
-	.hash		   = udp_lib_hash,
-	.unhash		   = udp_lib_unhash,
-	.rehash		   = udp_v6_rehash,
-	.get_port	   = udp_v6_get_port,
-	.memory_allocated  = &udp_memory_allocated,
-	.sysctl_mem	   = sysctl_udp_mem,
-	.sysctl_wmem	   = &sysctl_udp_wmem_min,
-	.sysctl_rmem	   = &sysctl_udp_rmem_min,
-	.obj_size	   = sizeof(struct udp6_sock),
-	.h.udp_table	   = &udp_table,
+	.name			= "UDPv6",
+	.owner			= THIS_MODULE,
+	.close			= udp_lib_close,
+	.connect		= ip6_datagram_connect,
+	.disconnect		= udp_disconnect,
+	.ioctl			= udp_ioctl,
+	.init			= udp_init_sock,
+	.destroy		= udpv6_destroy_sock,
+	.setsockopt		= udpv6_setsockopt,
+	.getsockopt		= udpv6_getsockopt,
+	.sendmsg		= udpv6_sendmsg,
+	.recvmsg		= udpv6_recvmsg,
+	.release_cb		= ip6_datagram_release_cb,
+	.hash			= udp_lib_hash,
+	.unhash			= udp_lib_unhash,
+	.rehash			= udp_v6_rehash,
+	.get_port		= udp_v6_get_port,
+	.memory_allocated	= &udp_memory_allocated,
+	.sysctl_mem		= sysctl_udp_mem,
+	.sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
+	.sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
+	.obj_size		= sizeof(struct udp6_sock),
+	.h.udp_table		= &udp_table,
 #ifdef CONFIG_COMPAT
-	.compat_setsockopt = compat_udpv6_setsockopt,
-	.compat_getsockopt = compat_udpv6_getsockopt,
+	.compat_setsockopt	= compat_udpv6_setsockopt,
+	.compat_getsockopt	= compat_udpv6_getsockopt,
 #endif
-	.diag_destroy      = udp_abort,
+	.diag_destroy		= udp_abort,
 };
 
 static struct inet_protosw udpv6_protosw = {
-- 
cgit v1.2.3


From 79ffdfc6522ae33d8a33e971070c08ee5f27439b Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 14 Mar 2018 22:17:20 +0300
Subject: net: Add rtnl_lock_killable()

rtnl_lock() is widely used mutex in kernel. Some of kernel code
does memory allocations under it. In case of memory deficit this
may invoke OOM killer, but the problem is a killed task can't
exit if it's waiting for the mutex. This may be a reason of deadlock
and panic.

This patch adds a new primitive, which responds on SIGKILL, and
it allows to use it in the places, where we don't want to sleep
forever.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 67f375cfb982..87079eaa871b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -75,6 +75,12 @@ void rtnl_lock(void)
 }
 EXPORT_SYMBOL(rtnl_lock);
 
+int rtnl_lock_killable(void)
+{
+	return mutex_lock_killable(&rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_lock_killable);
+
 static struct sk_buff *defer_kfree_skb_list;
 void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
 {
-- 
cgit v1.2.3


From b0f3debc9a1284d6b861e3f7cce0d119e6cd601d Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Wed, 14 Mar 2018 22:17:28 +0300
Subject: net: Use rtnl_lock_killable() in register_netdev()

This patch adds rtnl_lock_killable() to one of hot path
using rtnl_lock().

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 12a9aad0b057..d8887cc38e7b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8018,7 +8018,8 @@ int register_netdev(struct net_device *dev)
 {
 	int err;
 
-	rtnl_lock();
+	if (rtnl_lock_killable())
+		return -EINTR;
 	err = register_netdevice(dev);
 	rtnl_unlock();
 	return err;
-- 
cgit v1.2.3


From c246d942eabc3288f5d93930663411070093ac52 Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.vnet.ibm.com>
Date: Fri, 16 Mar 2018 15:06:39 +0100
Subject: net/smc: restructure netinfo for CLC proposal msgs

Introduce functions smc_clc_prfx_set to retrieve IP information for the
CLC proposal msg and smc_clc_prfx_match to match the contents of a
proposal message against the IP addresses of the net device. The new
functions replace the functionality provided by smc_clc_netinfo_by_tcpsk,
which is removed by this patch. The match functionality is extended to
scan all ipv4 addresses of the net device for a match against the
ipv4 subnet from the proposal msg.

Signed-off-by: Karsten Graul <kgraul@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c  |  14 ++------
 net/smc/smc_clc.c | 100 +++++++++++++++++++++++++++++++++++++++++-------------
 net/smc/smc_clc.h |   4 +--
 3 files changed, 82 insertions(+), 36 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 649489f825a5..949a2714a453 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -767,8 +767,6 @@ static void smc_listen_work(struct work_struct *work)
 	struct smc_link *link;
 	int reason_code = 0;
 	int rc = 0;
-	__be32 subnet;
-	u8 prefix_len;
 	u8 ibport;
 
 	/* check if peer is smc capable */
@@ -803,17 +801,11 @@ static void smc_listen_work(struct work_struct *work)
 		goto decline_rdma;
 	}
 
-	/* determine subnet and mask from internal TCP socket */
-	rc = smc_clc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
-	if (rc) {
-		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
-		goto decline_rdma;
-	}
-
 	pclc = (struct smc_clc_msg_proposal *)&buf;
 	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
-	if (pclc_prfx->outgoing_subnet != subnet ||
-	    pclc_prfx->prefix_len != prefix_len) {
+
+	rc = smc_clc_prfx_match(newclcsock, pclc_prfx);
+	if (rc) {
 		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
 		goto decline_rdma;
 	}
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 874c5a75d6dd..dc3a2235978d 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -74,15 +74,35 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
 	return true;
 }
 
-/* determine subnet and mask of internal TCP socket */
-int smc_clc_netinfo_by_tcpsk(struct socket *clcsock,
-			     __be32 *subnet, u8 *prefix_len)
+/* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */
+static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4,
+				 struct smc_clc_msg_proposal_prefix *prop)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dst->dev);
+
+	if (!in_dev)
+		return -ENODEV;
+	for_ifa(in_dev) {
+		if (!inet_ifa_match(ipv4, ifa))
+			continue;
+		prop->prefix_len = inet_mask_len(ifa->ifa_mask);
+		prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask;
+		/* prop->ipv6_prefixes_cnt = 0; already done by memset before */
+		return 0;
+	} endfor_ifa(in_dev);
+	return -ENOENT;
+}
+
+/* retrieve and set prefixes in CLC proposal msg */
+static int smc_clc_prfx_set(struct socket *clcsock,
+			    struct smc_clc_msg_proposal_prefix *prop)
 {
 	struct dst_entry *dst = sk_dst_get(clcsock->sk);
-	struct in_device *in_dev;
-	struct sockaddr_in addr;
+	struct sockaddr_storage addrs;
+	struct sockaddr_in *addr;
 	int rc = -ENOENT;
 
+	memset(prop, 0, sizeof(*prop));
 	if (!dst) {
 		rc = -ENOTCONN;
 		goto out;
@@ -91,22 +111,58 @@ int smc_clc_netinfo_by_tcpsk(struct socket *clcsock,
 		rc = -ENODEV;
 		goto out_rel;
 	}
-
 	/* get address to which the internal TCP socket is bound */
-	kernel_getsockname(clcsock, (struct sockaddr *)&addr);
-	/* analyze IPv4 specific data of net_device belonging to TCP socket */
+	kernel_getsockname(clcsock, (struct sockaddr *)&addrs);
+	/* analyze IP specific data of net_device belonging to TCP socket */
 	rcu_read_lock();
-	in_dev = __in_dev_get_rcu(dst->dev);
+	if (addrs.ss_family == PF_INET) {
+		/* IPv4 */
+		addr = (struct sockaddr_in *)&addrs;
+		rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop);
+	}
+	rcu_read_unlock();
+out_rel:
+	dst_release(dst);
+out:
+	return rc;
+}
+
+/* match ipv4 addrs of dev against addr in CLC proposal */
+static int smc_clc_prfx_match4_rcu(struct net_device *dev,
+				   struct smc_clc_msg_proposal_prefix *prop)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+	if (!in_dev)
+		return -ENODEV;
 	for_ifa(in_dev) {
-		if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
-			continue;
-		*prefix_len = inet_mask_len(ifa->ifa_mask);
-		*subnet = ifa->ifa_address & ifa->ifa_mask;
-		rc = 0;
-		break;
+		if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) &&
+		    inet_ifa_match(prop->outgoing_subnet, ifa))
+			return 0;
 	} endfor_ifa(in_dev);
-	rcu_read_unlock();
 
+	return -ENOENT;
+}
+
+/* check if proposed prefixes match one of our device prefixes */
+int smc_clc_prfx_match(struct socket *clcsock,
+		       struct smc_clc_msg_proposal_prefix *prop)
+{
+	struct dst_entry *dst = sk_dst_get(clcsock->sk);
+	int rc = -ENOENT;
+
+	if (!dst) {
+		rc = -ENOTCONN;
+		goto out;
+	}
+	if (!dst->dev) {
+		rc = -ENODEV;
+		goto out_rel;
+	}
+	rcu_read_lock();
+	if (!prop->ipv6_prefixes_cnt)
+		rc = smc_clc_prfx_match4_rcu(dst->dev, prop);
+	rcu_read_unlock();
 out_rel:
 	dst_release(dst);
 out:
@@ -240,6 +296,11 @@ int smc_clc_send_proposal(struct smc_sock *smc,
 	struct msghdr msg;
 	int len, plen, rc;
 
+	/* retrieve ip prefixes for CLC proposal msg */
+	rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx);
+	if (rc)
+		return SMC_CLC_DECL_CNFERR; /* configuration error */
+
 	/* send SMC Proposal CLC message */
 	plen = sizeof(pclc) + sizeof(pclc_prfx) + sizeof(trl);
 	memset(&pclc, 0, sizeof(pclc));
@@ -252,13 +313,6 @@ int smc_clc_send_proposal(struct smc_sock *smc,
 	memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
 	pclc.iparea_offset = htons(0);
 
-	memset(&pclc_prfx, 0, sizeof(pclc_prfx));
-	/* determine subnet and mask from internal TCP socket */
-	rc = smc_clc_netinfo_by_tcpsk(smc->clcsock, &pclc_prfx.outgoing_subnet,
-				      &pclc_prfx.prefix_len);
-	if (rc)
-		return SMC_CLC_DECL_CNFERR; /* configuration error */
-	pclc_prfx.ipv6_prefixes_cnt = 0;
 	memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	memset(&msg, 0, sizeof(msg));
 	vec[0].iov_base = &pclc;
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 20e048beac30..5ddb0d22eb8a 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -122,8 +122,8 @@ smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
 	       ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset));
 }
 
-int smc_clc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet,
-			     u8 *prefix_len);
+int smc_clc_prfx_match(struct socket *clcsock,
+		       struct smc_clc_msg_proposal_prefix *prop);
 int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 		     u8 expected_type);
 int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);
-- 
cgit v1.2.3


From 1a26d0201d7670fd6c9086e15504911ce240e6ff Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.vnet.ibm.com>
Date: Fri, 16 Mar 2018 15:06:40 +0100
Subject: net/smc: add ipv6 support to CLC layer

The CLC layer is updated to support ipv6 proposal messages from peers and
to match incoming proposal messages against the ipv6 addresses of the net
device. struct smc_clc_ipv6_prefix is updated to provide the space for an
ipv6 address (struct was not used before). SMC_CLC_MAX_LEN is updated to
include the size of the proposal prefix. Existing code in net is not
affected, the previous SMC_CLC_MAX_LEN value is large enough to hold ipv4
proposal messages.

Signed-off-by: Karsten Graul <kgraul@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/smc_clc.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++-------
 net/smc/smc_clc.h |  13 +++++--
 2 files changed, 105 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index dc3a2235978d..64fbc3230e6c 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -5,7 +5,7 @@
  *  CLC (connection layer control) handshake over initial TCP socket to
  *  prepare for RDMA traffic
  *
- *  Copyright IBM Corp. 2016
+ *  Copyright IBM Corp. 2016, 2018
  *
  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
  */
@@ -15,6 +15,7 @@
 #include <linux/if_ether.h>
 #include <linux/sched/signal.h>
 
+#include <net/addrconf.h>
 #include <net/sock.h>
 #include <net/tcp.h>
 
@@ -93,12 +94,44 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4,
 	return -ENOENT;
 }
 
+/* fill CLC proposal msg with ipv6 prefixes from device */
+static int smc_clc_prfx_set6_rcu(struct dst_entry *dst,
+				 struct smc_clc_msg_proposal_prefix *prop,
+				 struct smc_clc_ipv6_prefix *ipv6_prfx)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	struct inet6_dev *in6_dev = __in6_dev_get(dst->dev);
+	struct inet6_ifaddr *ifa;
+	int cnt = 0;
+
+	if (!in6_dev)
+		return -ENODEV;
+	/* use a maximum of 8 IPv6 prefixes from device */
+	list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
+		if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)
+			continue;
+		ipv6_addr_prefix(&ipv6_prfx[cnt].prefix,
+				 &ifa->addr, ifa->prefix_len);
+		ipv6_prfx[cnt].prefix_len = ifa->prefix_len;
+		cnt++;
+		if (cnt == SMC_CLC_MAX_V6_PREFIX)
+			break;
+	}
+	prop->ipv6_prefixes_cnt = cnt;
+	if (cnt)
+		return 0;
+#endif
+	return -ENOENT;
+}
+
 /* retrieve and set prefixes in CLC proposal msg */
 static int smc_clc_prfx_set(struct socket *clcsock,
-			    struct smc_clc_msg_proposal_prefix *prop)
+			    struct smc_clc_msg_proposal_prefix *prop,
+			    struct smc_clc_ipv6_prefix *ipv6_prfx)
 {
 	struct dst_entry *dst = sk_dst_get(clcsock->sk);
 	struct sockaddr_storage addrs;
+	struct sockaddr_in6 *addr6;
 	struct sockaddr_in *addr;
 	int rc = -ENOENT;
 
@@ -114,11 +147,19 @@ static int smc_clc_prfx_set(struct socket *clcsock,
 	/* get address to which the internal TCP socket is bound */
 	kernel_getsockname(clcsock, (struct sockaddr *)&addrs);
 	/* analyze IP specific data of net_device belonging to TCP socket */
+	addr6 = (struct sockaddr_in6 *)&addrs;
 	rcu_read_lock();
 	if (addrs.ss_family == PF_INET) {
 		/* IPv4 */
 		addr = (struct sockaddr_in *)&addrs;
 		rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop);
+	} else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) {
+		/* mapped IPv4 address - peer is IPv4 only */
+		rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3],
+					   prop);
+	} else {
+		/* IPv6 */
+		rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx);
 	}
 	rcu_read_unlock();
 out_rel:
@@ -144,12 +185,41 @@ static int smc_clc_prfx_match4_rcu(struct net_device *dev,
 	return -ENOENT;
 }
 
+/* match ipv6 addrs of dev against addrs in CLC proposal */
+static int smc_clc_prfx_match6_rcu(struct net_device *dev,
+				   struct smc_clc_msg_proposal_prefix *prop)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	struct inet6_dev *in6_dev = __in6_dev_get(dev);
+	struct smc_clc_ipv6_prefix *ipv6_prfx;
+	struct inet6_ifaddr *ifa;
+	int i, max;
+
+	if (!in6_dev)
+		return -ENODEV;
+	/* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */
+	ipv6_prfx = (struct smc_clc_ipv6_prefix *)((u8 *)prop + sizeof(*prop));
+	max = min_t(u8, prop->ipv6_prefixes_cnt, SMC_CLC_MAX_V6_PREFIX);
+	list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
+		if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)
+			continue;
+		for (i = 0; i < max; i++) {
+			if (ifa->prefix_len == ipv6_prfx[i].prefix_len &&
+			    ipv6_prefix_equal(&ifa->addr, &ipv6_prfx[i].prefix,
+					      ifa->prefix_len))
+				return 0;
+		}
+	}
+#endif
+	return -ENOENT;
+}
+
 /* check if proposed prefixes match one of our device prefixes */
 int smc_clc_prfx_match(struct socket *clcsock,
 		       struct smc_clc_msg_proposal_prefix *prop)
 {
 	struct dst_entry *dst = sk_dst_get(clcsock->sk);
-	int rc = -ENOENT;
+	int rc;
 
 	if (!dst) {
 		rc = -ENOTCONN;
@@ -162,6 +232,8 @@ int smc_clc_prfx_match(struct socket *clcsock,
 	rcu_read_lock();
 	if (!prop->ipv6_prefixes_cnt)
 		rc = smc_clc_prfx_match4_rcu(dst->dev, prop);
+	else
+		rc = smc_clc_prfx_match6_rcu(dst->dev, prop);
 	rcu_read_unlock();
 out_rel:
 	dst_release(dst);
@@ -288,21 +360,24 @@ int smc_clc_send_proposal(struct smc_sock *smc,
 			  struct smc_ib_device *smcibdev,
 			  u8 ibport)
 {
+	struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX];
 	struct smc_clc_msg_proposal_prefix pclc_prfx;
 	struct smc_clc_msg_proposal pclc;
 	struct smc_clc_msg_trail trl;
+	int len, i, plen, rc;
 	int reason_code = 0;
-	struct kvec vec[3];
+	struct kvec vec[4];
 	struct msghdr msg;
-	int len, plen, rc;
 
 	/* retrieve ip prefixes for CLC proposal msg */
-	rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx);
+	rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx, ipv6_prfx);
 	if (rc)
 		return SMC_CLC_DECL_CNFERR; /* configuration error */
 
 	/* send SMC Proposal CLC message */
-	plen = sizeof(pclc) + sizeof(pclc_prfx) + sizeof(trl);
+	plen = sizeof(pclc) + sizeof(pclc_prfx) +
+	       (pclc_prfx.ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) +
+	       sizeof(trl);
 	memset(&pclc, 0, sizeof(pclc));
 	memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	pclc.hdr.type = SMC_CLC_PROPOSAL;
@@ -315,14 +390,20 @@ int smc_clc_send_proposal(struct smc_sock *smc,
 
 	memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	memset(&msg, 0, sizeof(msg));
-	vec[0].iov_base = &pclc;
-	vec[0].iov_len = sizeof(pclc);
-	vec[1].iov_base = &pclc_prfx;
-	vec[1].iov_len = sizeof(pclc_prfx);
-	vec[2].iov_base = &trl;
-	vec[2].iov_len = sizeof(trl);
+	i = 0;
+	vec[i].iov_base = &pclc;
+	vec[i++].iov_len = sizeof(pclc);
+	vec[i].iov_base = &pclc_prfx;
+	vec[i++].iov_len = sizeof(pclc_prfx);
+	if (pclc_prfx.ipv6_prefixes_cnt > 0) {
+		vec[i].iov_base = &ipv6_prfx[0];
+		vec[i++].iov_len = pclc_prfx.ipv6_prefixes_cnt *
+				   sizeof(ipv6_prfx[0]);
+	}
+	vec[i].iov_base = &trl;
+	vec[i++].iov_len = sizeof(trl);
 	/* due to the few bytes needed for clc-handshake this cannot block */
-	len = kernel_sendmsg(smc->clcsock, &msg, vec, 3, plen);
+	len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen);
 	if (len < sizeof(pclc)) {
 		if (len >= 0) {
 			reason_code = -ENETUNREACH;
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 5ddb0d22eb8a..63bf1dc2c1f9 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -60,10 +60,15 @@ struct smc_clc_msg_local {	/* header2 of clc messages */
 	u8 mac[6];		/* mac of ib_device port */
 };
 
+#define SMC_CLC_MAX_V6_PREFIX	8
+
+/* Struct would be 4 byte aligned, but it is used in an array that is sent
+ * to peers and must conform to RFC7609, hence we need to use packed here.
+ */
 struct smc_clc_ipv6_prefix {
-	u8 prefix[4];
+	struct in6_addr prefix;
 	u8 prefix_len;
-} __packed;
+} __packed;			/* format defined in RFC7609 */
 
 struct smc_clc_msg_proposal_prefix {	/* prefix part of clc proposal message*/
 	__be32 outgoing_subnet;	/* subnet mask */
@@ -79,9 +84,11 @@ struct smc_clc_msg_proposal {	/* clc proposal message sent by Linux */
 } __aligned(4);
 
 #define SMC_CLC_PROPOSAL_MAX_OFFSET	0x28
-#define SMC_CLC_PROPOSAL_MAX_PREFIX	(8 * sizeof(struct smc_clc_ipv6_prefix))
+#define SMC_CLC_PROPOSAL_MAX_PREFIX	(SMC_CLC_MAX_V6_PREFIX * \
+					 sizeof(struct smc_clc_ipv6_prefix))
 #define SMC_CLC_MAX_LEN		(sizeof(struct smc_clc_msg_proposal) + \
 				 SMC_CLC_PROPOSAL_MAX_OFFSET + \
+				 sizeof(struct smc_clc_msg_proposal_prefix) + \
 				 SMC_CLC_PROPOSAL_MAX_PREFIX + \
 				 sizeof(struct smc_clc_msg_trail))
 
-- 
cgit v1.2.3


From aaa4d33f6da1e1a415f4f7c299a97defd845ce7d Mon Sep 17 00:00:00 2001
From: Karsten Graul <kgraul@linux.vnet.ibm.com>
Date: Fri, 16 Mar 2018 15:06:41 +0100
Subject: net/smc: enable ipv6 support for smc

Add ipv6 support to the smc socket layer functions. Make use of the
updated clc layer functions to retrieve and match ipv6 information.
The indicator for ipv4 or ipv6 is the protocol constant that is provided
in the socket() call with address family AF_SMC.

Based-on-patch-by: Takanori Ueda <tkueda@jp.ibm.com>

Signed-off-by: Karsten Graul <kgraul@linux.vnet.ibm.com>
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/smc/af_smc.c | 64 ++++++++++++++++++++++++++++++++++++++++++--------------
 net/smc/smc.h    |  4 +++-
 2 files changed, 51 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 949a2714a453..86913eb5cfa0 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -7,12 +7,11 @@
  *  applicable with RoCE-cards only
  *
  *  Initial restrictions:
- *    - IPv6 support postponed
  *    - support for alternate links postponed
  *    - partial support for non-blocking sockets only
  *    - support for urgent data postponed
  *
- *  Copyright IBM Corp. 2016
+ *  Copyright IBM Corp. 2016, 2018
  *
  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
  *              based on prototype from Frank Blaschka
@@ -64,6 +63,10 @@ static struct smc_hashinfo smc_v4_hashinfo = {
 	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
 };
 
+static struct smc_hashinfo smc_v6_hashinfo = {
+	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
+};
+
 int smc_hash_sk(struct sock *sk)
 {
 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
@@ -103,6 +106,18 @@ struct proto smc_proto = {
 };
 EXPORT_SYMBOL_GPL(smc_proto);
 
+struct proto smc_proto6 = {
+	.name		= "SMC6",
+	.owner		= THIS_MODULE,
+	.keepalive	= smc_set_keepalive,
+	.hash		= smc_hash_sk,
+	.unhash		= smc_unhash_sk,
+	.obj_size	= sizeof(struct smc_sock),
+	.h.smc_hash	= &smc_v6_hashinfo,
+	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
+};
+EXPORT_SYMBOL_GPL(smc_proto6);
+
 static int smc_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -159,19 +174,22 @@ static void smc_destruct(struct sock *sk)
 	sk_refcnt_debug_dec(sk);
 }
 
-static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
+static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
+				   int protocol)
 {
 	struct smc_sock *smc;
+	struct proto *prot;
 	struct sock *sk;
 
-	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
+	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
+	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
 	if (!sk)
 		return NULL;
 
 	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
 	sk->sk_state = SMC_INIT;
 	sk->sk_destruct = smc_destruct;
-	sk->sk_protocol = SMCPROTO_SMC;
+	sk->sk_protocol = protocol;
 	smc = smc_sk(sk);
 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
 	INIT_LIST_HEAD(&smc->accept_q);
@@ -198,10 +216,13 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
 		goto out;
 
 	rc = -EAFNOSUPPORT;
+	if (addr->sin_family != AF_INET &&
+	    addr->sin_family != AF_INET6 &&
+	    addr->sin_family != AF_UNSPEC)
+		goto out;
 	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
-	if ((addr->sin_family != AF_INET) &&
-	    ((addr->sin_family != AF_UNSPEC) ||
-	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
+	if (addr->sin_family == AF_UNSPEC &&
+	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
 		goto out;
 
 	lock_sock(sk);
@@ -529,7 +550,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr,
 	/* separate smc parameter checking to be safe */
 	if (alen < sizeof(addr->sa_family))
 		goto out_err;
-	if (addr->sa_family != AF_INET)
+	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
 		goto out_err;
 
 	lock_sock(sk);
@@ -571,7 +592,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
 	int rc;
 
 	release_sock(lsk);
-	new_sk = smc_sock_alloc(sock_net(lsk), NULL);
+	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
 	if (!new_sk) {
 		rc = -ENOMEM;
 		lsk->sk_err = ENOMEM;
@@ -1367,6 +1388,7 @@ static const struct proto_ops smc_sock_ops = {
 static int smc_create(struct net *net, struct socket *sock, int protocol,
 		      int kern)
 {
+	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
 	struct smc_sock *smc;
 	struct sock *sk;
 	int rc;
@@ -1376,20 +1398,20 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
 		goto out;
 
 	rc = -EPROTONOSUPPORT;
-	if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
+	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
 		goto out;
 
 	rc = -ENOBUFS;
 	sock->ops = &smc_sock_ops;
-	sk = smc_sock_alloc(net, sock);
+	sk = smc_sock_alloc(net, sock, protocol);
 	if (!sk)
 		goto out;
 
 	/* create internal TCP socket for CLC handshake and fallback */
 	smc = smc_sk(sk);
 	smc->use_fallback = false; /* assume rdma capability first */
-	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
-			      IPPROTO_TCP, &smc->clcsock);
+	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
+			      &smc->clcsock);
 	if (rc) {
 		sk_common_release(sk);
 		goto out;
@@ -1429,16 +1451,23 @@ static int __init smc_init(void)
 
 	rc = proto_register(&smc_proto, 1);
 	if (rc) {
-		pr_err("%s: proto_register fails with %d\n", __func__, rc);
+		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
 		goto out_pnet;
 	}
 
+	rc = proto_register(&smc_proto6, 1);
+	if (rc) {
+		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
+		goto out_proto;
+	}
+
 	rc = sock_register(&smc_sock_family_ops);
 	if (rc) {
 		pr_err("%s: sock_register fails with %d\n", __func__, rc);
-		goto out_proto;
+		goto out_proto6;
 	}
 	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
+	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
 
 	rc = smc_ib_register_client();
 	if (rc) {
@@ -1451,6 +1480,8 @@ static int __init smc_init(void)
 
 out_sock:
 	sock_unregister(PF_SMC);
+out_proto6:
+	proto_unregister(&smc_proto6);
 out_proto:
 	proto_unregister(&smc_proto);
 out_pnet:
@@ -1475,6 +1506,7 @@ static void __exit smc_exit(void)
 	static_branch_disable(&tcp_have_smc);
 	smc_ib_unregister_client();
 	sock_unregister(PF_SMC);
+	proto_unregister(&smc_proto6);
 	proto_unregister(&smc_proto);
 	smc_pnet_exit();
 }
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 268cdf11533c..e4829a2f46ba 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -18,11 +18,13 @@
 
 #include "smc_ib.h"
 
-#define SMCPROTO_SMC		0	/* SMC protocol */
+#define SMCPROTO_SMC		0	/* SMC protocol, IPv4 */
+#define SMCPROTO_SMC6		1	/* SMC protocol, IPv6 */
 
 #define SMC_MAX_PORTS		2	/* Max # of ports */
 
 extern struct proto smc_proto;
+extern struct proto smc_proto6;
 
 #ifdef ATOMIC64_INIT
 #define KERNEL_HAS_ATOMIC64
-- 
cgit v1.2.3


From 7156d194a0772f733865267e7207e0b08f81b02b Mon Sep 17 00:00:00 2001
From: Yousuk Seung <ysseung@google.com>
Date: Fri, 16 Mar 2018 10:51:07 -0700
Subject: tcp: add snd_ssthresh stat in SCM_TIMESTAMPING_OPT_STATS

This patch adds TCP_NLA_SND_SSTHRESH stat into SCM_TIMESTAMPING_OPT_STATS
that reports tcp_sock.snd_ssthresh.

Signed-off-by: Yousuk Seung <ysseung@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fb350f740f69..e553f84bde83 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3031,7 +3031,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 	u32 rate;
 
 	stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
-			  4 * nla_total_size(sizeof(u32)) +
+			  5 * nla_total_size(sizeof(u32)) +
 			  3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
 	if (!stats)
 		return NULL;
@@ -3061,6 +3061,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
 
 	nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
 	nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
+	nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
 
 	nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
 	nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
-- 
cgit v1.2.3


From 53794570049d314742f156c99022914840a3d5ab Mon Sep 17 00:00:00 2001
From: Yousuk Seung <ysseung@google.com>
Date: Fri, 16 Mar 2018 10:51:49 -0700
Subject: net-tcp_bbr: set tp->snd_ssthresh to BDP upon STARTUP exit

Set tp->snd_ssthresh to BDP upon STARTUP exit. This allows us
to check if a BBR flow exited STARTUP and the BDP at the
time of STARTUP exit with SCM_TIMESTAMPING_OPT_STATS. Since BBR does not
use snd_ssthresh this fix has no impact on BBR's behavior.

Signed-off-by: Yousuk Seung <ysseung@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_bbr.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index c92014cb1e16..158d105e76da 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -731,6 +731,8 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
 		bbr->mode = BBR_DRAIN;	/* drain queue we created */
 		bbr->pacing_gain = bbr_drain_gain;	/* pace slow to drain */
 		bbr->cwnd_gain = bbr_high_gain;	/* maintain cwnd */
+		tcp_sk(sk)->snd_ssthresh =
+				bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT);
 	}	/* fall through to check if in-flight is already small: */
 	if (bbr->mode == BBR_DRAIN &&
 	    tcp_packets_in_flight(tcp_sk(sk)) <=
@@ -834,6 +836,7 @@ static void bbr_init(struct sock *sk)
 	struct bbr *bbr = inet_csk_ca(sk);
 
 	bbr->prior_cwnd = 0;
+	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	bbr->rtt_cnt = 0;
 	bbr->next_rtt_delivered = 0;
 	bbr->prev_ca_state = TCP_CA_Open;
@@ -886,7 +889,7 @@ static u32 bbr_undo_cwnd(struct sock *sk)
 static u32 bbr_ssthresh(struct sock *sk)
 {
 	bbr_save_cwnd(sk);
-	return TCP_INFINITE_SSTHRESH;	 /* BBR does not use ssthresh */
+	return tcp_sk(sk)->snd_ssthresh;
 }
 
 static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
-- 
cgit v1.2.3


From 489b30b53f0540b9f8e391cbb2839cea48b5d1c1 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 15 Mar 2018 12:10:57 +0300
Subject: net: Convert l2tp_net_ops

Init method is rather simple. Exit method queues del_work
for every tunnel from per-net list. This seems to be safe
to be marked async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Guillaume Nault <g.nault@alphalink.fr>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/l2tp/l2tp_core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 83421c6f0bef..189a12a5e4ac 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1787,6 +1787,7 @@ static struct pernet_operations l2tp_net_ops = {
 	.exit = l2tp_exit_net,
 	.id   = &l2tp_net_id,
 	.size = sizeof(struct l2tp_net),
+	.async = true,
 };
 
 static int __init l2tp_init(void)
-- 
cgit v1.2.3


From 8cec2f49dc413d6328067d22862b0bdd0f4305ec Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 15 Mar 2018 12:11:06 +0300
Subject: net: Convert mpls_net_ops

These pernet_operations register and unregister sysctl table.
Exit methods frees platform_labels from net::mpls::platform_label.
Everything is per-net, and they looks safe to be marked async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/mpls/af_mpls.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 7a4de6d618b1..d4a89a8be013 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2488,6 +2488,7 @@ static void mpls_net_exit(struct net *net)
 static struct pernet_operations mpls_net_ops = {
 	.init = mpls_net_init,
 	.exit = mpls_net_exit,
+	.async = true,
 };
 
 static struct rtnl_af_ops mpls_af_ops __read_mostly = {
-- 
cgit v1.2.3


From ec716650a750334cd763024597159eea3569e207 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 15 Mar 2018 12:11:16 +0300
Subject: net: Convert ovs_net_ops

These pernet_operations initialize and destroy net_generic()
data pointed by ovs_net_id. Exit method destroys vports from
alive net to exiting net. Since they are only pernet_operations
interested in this data, and exit method is executed under
exclusive global lock (ovs_mutex), they are safe to be executed
in parallel.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/datapath.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index ef38e5aecd28..100191df0371 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2384,6 +2384,7 @@ static struct pernet_operations ovs_net_ops = {
 	.exit = ovs_exit_net,
 	.id   = &ovs_net_id,
 	.size = sizeof(struct ovs_net),
+	.async = true,
 };
 
 static int __init dp_init(void)
-- 
cgit v1.2.3


From 554855ccdf37262def7fed557a5efec39f2c0f47 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 15 Mar 2018 12:11:25 +0300
Subject: net: Convert ipvs_core_ops

These pernet_operations register and unregister nf hooks,
/proc entries, sysctl, percpu statistics. There are several
global lists, and the only list modified without exclusive
locks is ip_vs_conn_tab in ip_vs_conn_flush(). We iterate
the list and force the timers expire at the moment. Since
there were possible several timer expirations before this
patch, and since they are safe, the patch does not invent
new parallelism of their destruction. These pernet_operations
look safe to be converted.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/ipvs/ip_vs_core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 5f6f73cf2174..c5d16e2bc8e2 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -2289,6 +2289,7 @@ static struct pernet_operations ipvs_core_ops = {
 	.exit = __ip_vs_cleanup,
 	.id   = &ip_vs_net_id,
 	.size = sizeof(struct netns_ipvs),
+	.async = true,
 };
 
 static struct pernet_operations ipvs_core_dev_ops = {
-- 
cgit v1.2.3


From d0edfbb4ba4a88399c169cf2c26b252d39f503bc Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 15 Mar 2018 12:11:35 +0300
Subject: net: Convert ipvs_core_dev_ops

Exit method stops two per-net threads and cancels
delayed work. Everything looks nicely per-net divided.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/ipvs/ip_vs_core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index c5d16e2bc8e2..6a6cb9db030b 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -2294,6 +2294,7 @@ static struct pernet_operations ipvs_core_ops = {
 
 static struct pernet_operations ipvs_core_dev_ops = {
 	.exit = __ip_vs_dev_cleanup,
+	.async = true,
 };
 
 /*
-- 
cgit v1.2.3


From 6c77e79557acd9b3b896a8075a19ef11ed887a99 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 15 Mar 2018 12:11:44 +0300
Subject: net: Convert ip_vs_ftp_ops

These pernet_operations register and unregister ipvs app.
register_ip_vs_app(), unregister_ip_vs_app() and
register_ip_vs_app_inc() modify per-net structures,
and there are no global structures touched. So,
this looks safe to be marked as async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/ipvs/ip_vs_ftp.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 58d5d05aec24..8b25aab41928 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -479,6 +479,7 @@ static void __ip_vs_ftp_exit(struct net *net)
 static struct pernet_operations ip_vs_ftp_ops = {
 	.init = __ip_vs_ftp_init,
 	.exit = __ip_vs_ftp_exit,
+	.async = true,
 };
 
 static int __init ip_vs_ftp_init(void)
-- 
cgit v1.2.3


From 928df1880e24bcd47d6359ff86df24db3dfba3c3 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Mar 2018 16:48:51 +0100
Subject: tipc: obsolete TIPC_ZONE_SCOPE

Publications for TIPC_CLUSTER_SCOPE and TIPC_ZONE_SCOPE are in all
aspects handled the same way, both on the publishing node and on the
receiving nodes.

Despite previous ambitions to the contrary, this is never going to change,
so we take the conseqeunce of this and obsolete TIPC_ZONE_SCOPE and related
macros/functions. Whenever a user is doing a bind() or a sendmsg() attempt
using ZONE_SCOPE we translate this internally to CLUSTER_SCOPE, while we
remain compatible with users and remote nodes still using ZONE_SCOPE.

Furthermore, the non-formalized scope value 0 has always been permitted
for use during lookup, with the same meaning as ZONE_SCOPE/CLUSTER_SCOPE.
We now permit it even as binding scope, but for compatibility reasons we
choose to not change the value of TIPC_CLUSTER_SCOPE.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/addr.c       | 31 -------------------------------
 net/tipc/addr.h       | 10 ++++++++++
 net/tipc/msg.c        |  2 +-
 net/tipc/name_table.c |  3 +--
 net/tipc/net.c        |  2 +-
 net/tipc/socket.c     | 15 ++++++++++-----
 6 files changed, 23 insertions(+), 40 deletions(-)

(limited to 'net')

diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index 48fd3b5a73fb..97cd857d7f43 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -63,23 +63,6 @@ int in_own_node(struct net *net, u32 addr)
 	return (addr == tn->own_addr) || !addr;
 }
 
-/**
- * addr_domain - convert 2-bit scope value to equivalent message lookup domain
- *
- * Needed when address of a named message must be looked up a second time
- * after a network hop.
- */
-u32 addr_domain(struct net *net, u32 sc)
-{
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-
-	if (likely(sc == TIPC_NODE_SCOPE))
-		return tn->own_addr;
-	if (sc == TIPC_CLUSTER_SCOPE)
-		return tipc_cluster_mask(tn->own_addr);
-	return tipc_zone_mask(tn->own_addr);
-}
-
 /**
  * tipc_addr_domain_valid - validates a network domain address
  *
@@ -124,20 +107,6 @@ int tipc_in_scope(u32 domain, u32 addr)
 	return 0;
 }
 
-/**
- * tipc_addr_scope - convert message lookup domain to a 2-bit scope value
- */
-int tipc_addr_scope(u32 domain)
-{
-	if (likely(!domain))
-		return TIPC_ZONE_SCOPE;
-	if (tipc_node(domain))
-		return TIPC_NODE_SCOPE;
-	if (tipc_cluster(domain))
-		return TIPC_CLUSTER_SCOPE;
-	return TIPC_ZONE_SCOPE;
-}
-
 char *tipc_addr_string_fill(char *string, u32 addr)
 {
 	snprintf(string, 16, "<%u.%u.%u>",
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
index bebb347803ce..2ecf5a5d40dd 100644
--- a/net/tipc/addr.h
+++ b/net/tipc/addr.h
@@ -60,6 +60,16 @@ static inline u32 tipc_cluster_mask(u32 addr)
 	return addr & TIPC_ZONE_CLUSTER_MASK;
 }
 
+static inline int tipc_node2scope(u32 node)
+{
+	return node ? TIPC_NODE_SCOPE : TIPC_CLUSTER_SCOPE;
+}
+
+static inline int tipc_scope2node(struct net *net, int sc)
+{
+	return sc != TIPC_NODE_SCOPE ? 0 : tipc_own_addr(net);
+}
+
 u32 tipc_own_addr(struct net *net);
 int in_own_cluster(struct net *net, u32 addr);
 int in_own_cluster_exact(struct net *net, u32 addr);
diff --git a/net/tipc/msg.c b/net/tipc/msg.c
index 4e1c6f6450bb..b6c45dccba3d 100644
--- a/net/tipc/msg.c
+++ b/net/tipc/msg.c
@@ -580,7 +580,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)
 	msg = buf_msg(skb);
 	if (msg_reroute_cnt(msg))
 		return false;
-	dnode = addr_domain(net, msg_lookup_scope(msg));
+	dnode = tipc_scope2node(net, msg_lookup_scope(msg));
 	dport = tipc_nametbl_translate(net, msg_nametype(msg),
 				       msg_nameinst(msg), &dnode);
 	if (!dport)
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index e01c9c691ba2..6772390fcb00 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -473,8 +473,7 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
 	struct name_seq *seq = nametbl_find_seq(net, type);
 	int index = hash(type);
 
-	if ((scope < TIPC_ZONE_SCOPE) || (scope > TIPC_NODE_SCOPE) ||
-	    (lower > upper)) {
+	if (scope > TIPC_NODE_SCOPE || lower > upper) {
 		pr_debug("Failed to publish illegal {%u,%u,%u} with scope %u\n",
 			 type, lower, upper, scope);
 		return NULL;
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 1a2fde0d6f61..5c4c4405b78e 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -118,7 +118,7 @@ int tipc_net_start(struct net *net, u32 addr)
 	tipc_sk_reinit(net);
 
 	tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr,
-			     TIPC_ZONE_SCOPE, 0, tn->own_addr);
+			     TIPC_CLUSTER_SCOPE, 0, tn->own_addr);
 
 	pr_info("Started in network mode\n");
 	pr_info("Own node address %s, network identity %u\n",
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 8b04e601311c..910d3827f499 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -644,7 +644,7 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr,
 		goto exit;
 	}
 
-	res = (addr->scope > 0) ?
+	res = (addr->scope >= 0) ?
 		tipc_sk_publish(tsk, addr->scope, &addr->addr.nameseq) :
 		tipc_sk_withdraw(tsk, -addr->scope, &addr->addr.nameseq);
 exit:
@@ -1280,8 +1280,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
 	struct tipc_msg *hdr = &tsk->phdr;
 	struct tipc_name_seq *seq;
 	struct sk_buff_head pkts;
-	u32 type, inst, domain;
 	u32 dnode, dport;
+	u32 type, inst;
 	int mtu, rc;
 
 	if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE))
@@ -1332,13 +1332,12 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
 	if (dest->addrtype == TIPC_ADDR_NAME) {
 		type = dest->addr.name.name.type;
 		inst = dest->addr.name.name.instance;
-		domain = dest->addr.name.domain;
-		dnode = domain;
+		dnode = dest->addr.name.domain;
 		msg_set_type(hdr, TIPC_NAMED_MSG);
 		msg_set_hdr_sz(hdr, NAMED_H_SIZE);
 		msg_set_nametype(hdr, type);
 		msg_set_nameinst(hdr, inst);
-		msg_set_lookup_scope(hdr, tipc_addr_scope(domain));
+		msg_set_lookup_scope(hdr, tipc_node2scope(dnode));
 		dport = tipc_nametbl_translate(net, type, inst, &dnode);
 		msg_set_destnode(hdr, dnode);
 		msg_set_destport(hdr, dport);
@@ -2592,6 +2591,9 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
 	struct publication *publ;
 	u32 key;
 
+	if (scope != TIPC_NODE_SCOPE)
+		scope = TIPC_CLUSTER_SCOPE;
+
 	if (tipc_sk_connected(sk))
 		return -EINVAL;
 	key = tsk->portid + tsk->pub_count + 1;
@@ -2617,6 +2619,9 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
 	struct publication *safe;
 	int rc = -EINVAL;
 
+	if (scope != TIPC_NODE_SCOPE)
+		scope = TIPC_CLUSTER_SCOPE;
+
 	list_for_each_entry_safe(publ, safe, &tsk->publications, pport_list) {
 		if (seq) {
 			if (publ->scope != scope)
-- 
cgit v1.2.3


From 64a52b26d5633d6efc35cdf1e0c627cc4189e55a Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Mar 2018 16:48:52 +0100
Subject: tipc: remove zone publication list in name table

As a consequence of the previous commit we nan now eliminate zone scope
related lists in the name table. We start with name_table::publ_list[3],
which can now be replaced with two lists, one for node scope publications
and one for cluster scope publications.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/core.h       |  5 +++++
 net/tipc/name_distr.c | 39 ++++++++++++++++++---------------------
 net/tipc/name_table.c |  5 ++---
 net/tipc/name_table.h |  6 ++++--
 4 files changed, 29 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/net/tipc/core.h b/net/tipc/core.h
index ff8b071654f5..347f850dc872 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -131,6 +131,11 @@ static inline struct list_head *tipc_nodes(struct net *net)
 	return &tipc_net(net)->node_list;
 }
 
+static inline struct name_table *tipc_name_table(struct net *net)
+{
+	return tipc_net(net)->nametbl;
+}
+
 static inline struct tipc_topsrv *tipc_topsrv(struct net *net)
 {
 	return tipc_net(net)->topsrv;
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 23f8899e0f8c..11ce20505550 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -86,25 +86,25 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size,
  */
 struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	struct sk_buff *buf;
+	struct name_table *nt = tipc_name_table(net);
 	struct distr_item *item;
+	struct sk_buff *skb;
 
-	list_add_tail_rcu(&publ->local_list,
-			  &tn->nametbl->publ_list[publ->scope]);
-
-	if (publ->scope == TIPC_NODE_SCOPE)
+	if (publ->scope == TIPC_NODE_SCOPE) {
+		list_add_tail_rcu(&publ->local_list, &nt->node_scope);
 		return NULL;
+	}
+	list_add_tail_rcu(&publ->local_list, &nt->cluster_scope);
 
-	buf = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0);
-	if (!buf) {
+	skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0);
+	if (!skb) {
 		pr_warn("Publication distribution failure\n");
 		return NULL;
 	}
 
-	item = (struct distr_item *)msg_data(buf_msg(buf));
+	item = (struct distr_item *)msg_data(buf_msg(skb));
 	publ_to_item(item, publ);
-	return buf;
+	return skb;
 }
 
 /**
@@ -184,16 +184,13 @@ static void named_distribute(struct net *net, struct sk_buff_head *list,
  */
 void tipc_named_node_up(struct net *net, u32 dnode)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct name_table *nt = tipc_name_table(net);
 	struct sk_buff_head head;
 
 	__skb_queue_head_init(&head);
 
 	rcu_read_lock();
-	named_distribute(net, &head, dnode,
-			 &tn->nametbl->publ_list[TIPC_CLUSTER_SCOPE]);
-	named_distribute(net, &head, dnode,
-			 &tn->nametbl->publ_list[TIPC_ZONE_SCOPE]);
+	named_distribute(net, &head, dnode, &nt->cluster_scope);
 	rcu_read_unlock();
 
 	tipc_node_xmit(net, &head, dnode, 0);
@@ -382,16 +379,16 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq)
  */
 void tipc_named_reinit(struct net *net)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct name_table *nt = tipc_name_table(net);
+	struct tipc_net *tn = tipc_net(net);
 	struct publication *publ;
-	int scope;
 
 	spin_lock_bh(&tn->nametbl_lock);
 
-	for (scope = TIPC_ZONE_SCOPE; scope <= TIPC_NODE_SCOPE; scope++)
-		list_for_each_entry_rcu(publ, &tn->nametbl->publ_list[scope],
-					local_list)
-			publ->node = tn->own_addr;
+	list_for_each_entry_rcu(publ, &nt->node_scope, local_list)
+		publ->node = tn->own_addr;
+	list_for_each_entry_rcu(publ, &nt->cluster_scope, local_list)
+		publ->node = tn->own_addr;
 
 	spin_unlock_bh(&tn->nametbl_lock);
 }
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 6772390fcb00..1a3a327018c5 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -878,9 +878,8 @@ int tipc_nametbl_init(struct net *net)
 	for (i = 0; i < TIPC_NAMETBL_SIZE; i++)
 		INIT_HLIST_HEAD(&tipc_nametbl->seq_hlist[i]);
 
-	INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_ZONE_SCOPE]);
-	INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_CLUSTER_SCOPE]);
-	INIT_LIST_HEAD(&tipc_nametbl->publ_list[TIPC_NODE_SCOPE]);
+	INIT_LIST_HEAD(&tipc_nametbl->node_scope);
+	INIT_LIST_HEAD(&tipc_nametbl->cluster_scope);
 	tn->nametbl = tipc_nametbl;
 	spin_lock_init(&tn->nametbl_lock);
 	return 0;
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 17652602d5e2..47f72cddfb39 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -88,12 +88,14 @@ struct publication {
 /**
  * struct name_table - table containing all existing port name publications
  * @seq_hlist: name sequence hash lists
- * @publ_list: pulication lists
+ * @node_scope: all local publications with node scope
+ * @cluster_scope: all local publications with cluster scope
  * @local_publ_count: number of publications issued by this node
  */
 struct name_table {
 	struct hlist_head seq_hlist[TIPC_NAMETBL_SIZE];
-	struct list_head publ_list[TIPC_PUBL_SCOPE_NUM];
+	struct list_head node_scope;
+	struct list_head cluster_scope;
 	u32 local_publ_count;
 };
 
-- 
cgit v1.2.3


From ba765ec63786583e210b55073a908a9d7ea284fa Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Mar 2018 16:48:53 +0100
Subject: tipc: remove zone_list member in struct publication

As a further consequence of the previous commits, we can also remove
the member 'zone_list 'in struct name_info and struct publication.
Instead, we now let the member cluster_list take over the role a
container of all publications of a given <type,lower, upper>.
We also remove the counters for the size of those lists, since
they don't serve any purpose.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_table.c | 101 ++++++++++++++------------------------------------
 net/tipc/name_table.h |   5 +--
 2 files changed, 30 insertions(+), 76 deletions(-)

(limited to 'net')

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 1a3a327018c5..6d7b4c7ff4b7 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -50,24 +50,12 @@
 
 /**
  * struct name_info - name sequence publication info
- * @node_list: circular list of publications made by own node
- * @cluster_list: circular list of publications made by own cluster
- * @zone_list: circular list of publications made by own zone
- * @node_list_size: number of entries in "node_list"
- * @cluster_list_size: number of entries in "cluster_list"
- * @zone_list_size: number of entries in "zone_list"
- *
- * Note: The zone list always contains at least one entry, since all
- *       publications of the associated name sequence belong to it.
- *       (The cluster and node lists may be empty.)
+ * @node_list: list of publications on own node of this <type,lower,upper>
+ * @cluster_list: list of all publications of this <type,lower,upper>
  */
 struct name_info {
 	struct list_head node_list;
 	struct list_head cluster_list;
-	struct list_head zone_list;
-	u32 node_list_size;
-	u32 cluster_list_size;
-	u32 zone_list_size;
 };
 
 /**
@@ -249,7 +237,7 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net,
 		info = sseq->info;
 
 		/* Check if an identical publication already exists */
-		list_for_each_entry(publ, &info->zone_list, zone_list) {
+		list_for_each_entry(publ, &info->cluster_list, cluster_list) {
 			if ((publ->ref == port) && (publ->key == key) &&
 			    (!publ->node || (publ->node == node)))
 				return NULL;
@@ -292,7 +280,6 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net,
 
 		INIT_LIST_HEAD(&info->node_list);
 		INIT_LIST_HEAD(&info->cluster_list);
-		INIT_LIST_HEAD(&info->zone_list);
 
 		/* Insert new sub-sequence */
 		sseq = &nseq->sseqs[inspos];
@@ -311,18 +298,10 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net,
 	if (!publ)
 		return NULL;
 
-	list_add(&publ->zone_list, &info->zone_list);
-	info->zone_list_size++;
-
-	if (in_own_cluster(net, node)) {
-		list_add(&publ->cluster_list, &info->cluster_list);
-		info->cluster_list_size++;
-	}
+	list_add(&publ->cluster_list, &info->cluster_list);
 
-	if (in_own_node(net, node)) {
+	if (in_own_node(net, node))
 		list_add(&publ->node_list, &info->node_list);
-		info->node_list_size++;
-	}
 
 	/* Any subscriptions waiting for notification?  */
 	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
@@ -363,7 +342,7 @@ static struct publication *tipc_nameseq_remove_publ(struct net *net,
 	info = sseq->info;
 
 	/* Locate publication, if it exists */
-	list_for_each_entry(publ, &info->zone_list, zone_list) {
+	list_for_each_entry(publ, &info->cluster_list, cluster_list) {
 		if ((publ->key == key) && (publ->ref == ref) &&
 		    (!publ->node || (publ->node == node)))
 			goto found;
@@ -371,24 +350,12 @@ static struct publication *tipc_nameseq_remove_publ(struct net *net,
 	return NULL;
 
 found:
-	/* Remove publication from zone scope list */
-	list_del(&publ->zone_list);
-	info->zone_list_size--;
-
-	/* Remove publication from cluster scope list, if present */
-	if (in_own_cluster(net, node)) {
-		list_del(&publ->cluster_list);
-		info->cluster_list_size--;
-	}
-
-	/* Remove publication from node scope list, if present */
-	if (in_own_node(net, node)) {
+	list_del(&publ->cluster_list);
+	if (in_own_node(net, node))
 		list_del(&publ->node_list);
-		info->node_list_size--;
-	}
 
 	/* Contract subseq list if no more publications for that subseq */
-	if (list_empty(&info->zone_list)) {
+	if (list_empty(&info->cluster_list)) {
 		kfree(info);
 		free = &nseq->sseqs[nseq->first_free--];
 		memmove(sseq, sseq + 1, (free - (sseq + 1)) * sizeof(*sseq));
@@ -435,7 +402,8 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq,
 			struct name_info *info = sseq->info;
 			int must_report = 1;
 
-			list_for_each_entry(crs, &info->zone_list, zone_list) {
+			list_for_each_entry(crs, &info->cluster_list,
+					    cluster_list) {
 				tipc_sub_report_overlap(sub, sseq->lower,
 							sseq->upper,
 							TIPC_PUBLISHED,
@@ -559,18 +527,12 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
 						node_list);
 			list_move_tail(&publ->node_list,
 				       &info->node_list);
-		} else if (!list_empty(&info->cluster_list)) {
+		} else {
 			publ = list_first_entry(&info->cluster_list,
 						struct publication,
 						cluster_list);
 			list_move_tail(&publ->cluster_list,
 				       &info->cluster_list);
-		} else {
-			publ = list_first_entry(&info->zone_list,
-						struct publication,
-						zone_list);
-			list_move_tail(&publ->zone_list,
-				       &info->zone_list);
 		}
 	}
 
@@ -581,16 +543,10 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
 		publ = list_first_entry(&info->node_list, struct publication,
 					node_list);
 		list_move_tail(&publ->node_list, &info->node_list);
-	} else if (in_own_cluster_exact(net, *destnode)) {
-		if (list_empty(&info->cluster_list))
-			goto no_match;
+	} else {
 		publ = list_first_entry(&info->cluster_list, struct publication,
 					cluster_list);
 		list_move_tail(&publ->cluster_list, &info->cluster_list);
-	} else {
-		publ = list_first_entry(&info->zone_list, struct publication,
-					zone_list);
-		list_move_tail(&publ->zone_list, &info->zone_list);
 	}
 
 	ref = publ->ref;
@@ -622,7 +578,7 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope,
 	sseq = nameseq_find_subseq(seq, instance);
 	if (likely(sseq)) {
 		info = sseq->info;
-		list_for_each_entry(publ, &info->zone_list, zone_list) {
+		list_for_each_entry(publ, &info->cluster_list, cluster_list) {
 			if (publ->scope != scope)
 				continue;
 			if (publ->ref == exclude && publ->node == self)
@@ -631,7 +587,8 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope,
 			(*dstcnt)++;
 			if (all)
 				continue;
-			list_move_tail(&publ->zone_list, &info->zone_list);
+			list_move_tail(&publ->cluster_list,
+				       &info->cluster_list);
 			break;
 		}
 	}
@@ -641,15 +598,14 @@ exit:
 	return !list_empty(dsts);
 }
 
-int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
-			   u32 scope, bool exact, struct list_head *dports)
+void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
+			    u32 scope, bool exact, struct list_head *dports)
 {
 	struct sub_seq *sseq_stop;
 	struct name_info *info;
 	struct publication *p;
 	struct name_seq *seq;
 	struct sub_seq *sseq;
-	int res = 0;
 
 	rcu_read_lock();
 	seq = nametbl_find_seq(net, type);
@@ -667,14 +623,10 @@ int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
 			if (p->scope == scope || (!exact && p->scope < scope))
 				tipc_dest_push(dports, 0, p->ref);
 		}
-
-		if (info->cluster_list_size != info->node_list_size)
-			res = 1;
 	}
 	spin_unlock_bh(&seq->lock);
 exit:
 	rcu_read_unlock();
-	return res;
 }
 
 /* tipc_nametbl_lookup_dst_nodes - find broadcast destination nodes
@@ -699,7 +651,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
 	stop = seq->sseqs + seq->first_free;
 	for (; sseq != stop && sseq->lower <= upper; sseq++) {
 		info = sseq->info;
-		list_for_each_entry(publ, &info->zone_list, zone_list) {
+		list_for_each_entry(publ, &info->cluster_list, cluster_list) {
 			tipc_nlist_add(nodes, publ->node);
 		}
 	}
@@ -728,7 +680,7 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
 	stop = seq->sseqs + seq->first_free;
 	for (; sseq != stop; sseq++) {
 		info = sseq->info;
-		list_for_each_entry(p, &info->zone_list, zone_list) {
+		list_for_each_entry(p, &info->cluster_list, cluster_list) {
 			if (p->scope != scope)
 				continue;
 			tipc_group_add_member(grp, p->node, p->ref, p->lower);
@@ -899,7 +851,8 @@ static void tipc_purge_publications(struct net *net, struct name_seq *seq)
 	spin_lock_bh(&seq->lock);
 	sseq = seq->sseqs;
 	info = sseq->info;
-	list_for_each_entry_safe(publ, safe, &info->zone_list, zone_list) {
+	list_for_each_entry_safe(publ, safe, &info->cluster_list,
+				 cluster_list) {
 		tipc_nameseq_remove_publ(net, seq, publ->lower, publ->node,
 					 publ->ref, publ->key);
 		kfree_rcu(publ, rcu);
@@ -948,17 +901,19 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg,
 	struct publication *p;
 
 	if (*last_publ) {
-		list_for_each_entry(p, &sseq->info->zone_list, zone_list)
+		list_for_each_entry(p, &sseq->info->cluster_list,
+				    cluster_list)
 			if (p->key == *last_publ)
 				break;
 		if (p->key != *last_publ)
 			return -EPIPE;
 	} else {
-		p = list_first_entry(&sseq->info->zone_list, struct publication,
-				     zone_list);
+		p = list_first_entry(&sseq->info->cluster_list,
+				     struct publication,
+				     cluster_list);
 	}
 
-	list_for_each_entry_from(p, &sseq->info->zone_list, zone_list) {
+	list_for_each_entry_from(p, &sseq->info->cluster_list, cluster_list) {
 		*last_publ = p->key;
 
 		hdr = genlmsg_put(msg->skb, msg->portid, msg->seq,
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 47f72cddfb39..a9063e25ee74 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -81,7 +81,6 @@ struct publication {
 	struct list_head pport_list;
 	struct list_head node_list;
 	struct list_head cluster_list;
-	struct list_head zone_list;
 	struct rcu_head rcu;
 };
 
@@ -102,8 +101,8 @@ struct name_table {
 int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
 
 u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
-int tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
-			   u32 scope, bool exact, struct list_head *dports);
+void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
+			    u32 scope, bool exact, struct list_head *dports);
 void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
 			      u32 type, u32 domain);
 void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
-- 
cgit v1.2.3


From 935439cc48ef24f0e50396be3684a0f27e609363 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Mar 2018 16:48:54 +0100
Subject: tipc: merge two lists in struct publication

The size of struct publication can be reduced further. Membership in
lists 'nodesub_list' and 'local_list' is mutually exlusive, in that
remote publications use the former and local publications the latter.
We replace the two lists with one single, named 'binding_node' which
reflects what it really is.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_distr.c | 20 ++++++++++----------
 net/tipc/name_table.h |  5 ++---
 2 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 11ce20505550..4c54fb37875a 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -91,10 +91,10 @@ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ)
 	struct sk_buff *skb;
 
 	if (publ->scope == TIPC_NODE_SCOPE) {
-		list_add_tail_rcu(&publ->local_list, &nt->node_scope);
+		list_add_tail_rcu(&publ->binding_node, &nt->node_scope);
 		return NULL;
 	}
-	list_add_tail_rcu(&publ->local_list, &nt->cluster_scope);
+	list_add_tail_rcu(&publ->binding_node, &nt->cluster_scope);
 
 	skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0);
 	if (!skb) {
@@ -115,7 +115,7 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ)
 	struct sk_buff *buf;
 	struct distr_item *item;
 
-	list_del(&publ->local_list);
+	list_del(&publ->binding_node);
 
 	if (publ->scope == TIPC_NODE_SCOPE)
 		return NULL;
@@ -147,7 +147,7 @@ static void named_distribute(struct net *net, struct sk_buff_head *list,
 			ITEM_SIZE) * ITEM_SIZE;
 	u32 msg_rem = msg_dsz;
 
-	list_for_each_entry(publ, pls, local_list) {
+	list_for_each_entry(publ, pls, binding_node) {
 		/* Prepare next buffer: */
 		if (!skb) {
 			skb = named_prepare_buf(net, PUBLICATION, msg_rem,
@@ -211,7 +211,7 @@ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr)
 	p = tipc_nametbl_remove_publ(net, publ->type, publ->lower,
 				     publ->node, publ->ref, publ->key);
 	if (p)
-		tipc_node_unsubscribe(net, &p->nodesub_list, addr);
+		tipc_node_unsubscribe(net, &p->binding_node, addr);
 	spin_unlock_bh(&tn->nametbl_lock);
 
 	if (p != publ) {
@@ -246,7 +246,7 @@ void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr)
 {
 	struct publication *publ, *tmp;
 
-	list_for_each_entry_safe(publ, tmp, nsub_list, nodesub_list)
+	list_for_each_entry_safe(publ, tmp, nsub_list, binding_node)
 		tipc_publ_purge(net, publ, addr);
 	tipc_dist_queue_purge(net, addr);
 }
@@ -270,7 +270,7 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i,
 						TIPC_CLUSTER_SCOPE, node,
 						ntohl(i->ref), ntohl(i->key));
 		if (publ) {
-			tipc_node_subscribe(net, &publ->nodesub_list, node);
+			tipc_node_subscribe(net, &publ->binding_node, node);
 			return true;
 		}
 	} else if (dtype == WITHDRAWAL) {
@@ -279,7 +279,7 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i,
 						node, ntohl(i->ref),
 						ntohl(i->key));
 		if (publ) {
-			tipc_node_unsubscribe(net, &publ->nodesub_list, node);
+			tipc_node_unsubscribe(net, &publ->binding_node, node);
 			kfree_rcu(publ, rcu);
 			return true;
 		}
@@ -385,9 +385,9 @@ void tipc_named_reinit(struct net *net)
 
 	spin_lock_bh(&tn->nametbl_lock);
 
-	list_for_each_entry_rcu(publ, &nt->node_scope, local_list)
+	list_for_each_entry_rcu(publ, &nt->node_scope, binding_node)
 		publ->node = tn->own_addr;
-	list_for_each_entry_rcu(publ, &nt->cluster_scope, local_list)
+	list_for_each_entry_rcu(publ, &nt->cluster_scope, binding_node)
 		publ->node = tn->own_addr;
 
 	spin_unlock_bh(&tn->nametbl_lock);
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index a9063e25ee74..cb16bd883565 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -1,7 +1,7 @@
 /*
  * net/tipc/name_table.h: Include file for TIPC name table code
  *
- * Copyright (c) 2000-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2000-2006, 2014-2018, Ericsson AB
  * Copyright (c) 2004-2005, 2010-2011, Wind River Systems
  * All rights reserved.
  *
@@ -76,8 +76,7 @@ struct publication {
 	u32 node;
 	u32 ref;
 	u32 key;
-	struct list_head nodesub_list;
-	struct list_head local_list;
+	struct list_head binding_node;
 	struct list_head pport_list;
 	struct list_head node_list;
 	struct list_head cluster_list;
-- 
cgit v1.2.3


From e50e73e10757ac86fcb1aaa986055049e060727a Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 15 Mar 2018 16:48:55 +0100
Subject: tipc: some name changes

We rename some lists and fields in struct publication both to make
the naming more consistent and to better reflect their roles. We
also update the descriptions of those lists.

node_list -> local_publ
cluster_list -> all_publ
pport_list -> binding_sock
ref -> port

There are no functional changes in this commit.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_distr.c |  12 ++---
 net/tipc/name_distr.h |   2 +-
 net/tipc/name_table.c | 143 ++++++++++++++++++++++++--------------------------
 net/tipc/name_table.h |  38 ++++++++------
 net/tipc/socket.c     |  14 ++---
 5 files changed, 106 insertions(+), 103 deletions(-)

(limited to 'net')

diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 4c54fb37875a..28d095a7d8bb 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -56,7 +56,7 @@ static void publ_to_item(struct distr_item *i, struct publication *p)
 	i->type = htonl(p->type);
 	i->lower = htonl(p->lower);
 	i->upper = htonl(p->upper);
-	i->ref = htonl(p->ref);
+	i->port = htonl(p->port);
 	i->key = htonl(p->key);
 }
 
@@ -209,15 +209,15 @@ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr)
 
 	spin_lock_bh(&tn->nametbl_lock);
 	p = tipc_nametbl_remove_publ(net, publ->type, publ->lower,
-				     publ->node, publ->ref, publ->key);
+				     publ->node, publ->port, publ->key);
 	if (p)
 		tipc_node_unsubscribe(net, &p->binding_node, addr);
 	spin_unlock_bh(&tn->nametbl_lock);
 
 	if (p != publ) {
 		pr_err("Unable to remove publication from failed node\n"
-		       " (type=%u, lower=%u, node=0x%x, ref=%u, key=%u)\n",
-		       publ->type, publ->lower, publ->node, publ->ref,
+		       " (type=%u, lower=%u, node=0x%x, port=%u, key=%u)\n",
+		       publ->type, publ->lower, publ->node, publ->port,
 		       publ->key);
 	}
 
@@ -268,7 +268,7 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i,
 						ntohl(i->lower),
 						ntohl(i->upper),
 						TIPC_CLUSTER_SCOPE, node,
-						ntohl(i->ref), ntohl(i->key));
+						ntohl(i->port), ntohl(i->key));
 		if (publ) {
 			tipc_node_subscribe(net, &publ->binding_node, node);
 			return true;
@@ -276,7 +276,7 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i,
 	} else if (dtype == WITHDRAWAL) {
 		publ = tipc_nametbl_remove_publ(net, ntohl(i->type),
 						ntohl(i->lower),
-						node, ntohl(i->ref),
+						node, ntohl(i->port),
 						ntohl(i->key));
 		if (publ) {
 			tipc_node_unsubscribe(net, &publ->binding_node, node);
diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h
index 1264ba0af937..4753e628d7c4 100644
--- a/net/tipc/name_distr.h
+++ b/net/tipc/name_distr.h
@@ -63,7 +63,7 @@ struct distr_item {
 	__be32 type;
 	__be32 lower;
 	__be32 upper;
-	__be32 ref;
+	__be32 port;
 	__be32 key;
 };
 
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 6d7b4c7ff4b7..bbbfc0702634 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -1,7 +1,7 @@
 /*
  * net/tipc/name_table.c: TIPC name table code
  *
- * Copyright (c) 2000-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2000-2006, 2014-2018, Ericsson AB
  * Copyright (c) 2004-2008, 2010-2014, Wind River Systems
  * All rights reserved.
  *
@@ -51,11 +51,11 @@
 /**
  * struct name_info - name sequence publication info
  * @node_list: list of publications on own node of this <type,lower,upper>
- * @cluster_list: list of all publications of this <type,lower,upper>
+ * @all_publ: list of all publications of this <type,lower,upper>
  */
 struct name_info {
-	struct list_head node_list;
-	struct list_head cluster_list;
+	struct list_head local_publ;
+	struct list_head all_publ;
 };
 
 /**
@@ -102,7 +102,7 @@ static int hash(int x)
  * publ_create - create a publication structure
  */
 static struct publication *publ_create(u32 type, u32 lower, u32 upper,
-				       u32 scope, u32 node, u32 port_ref,
+				       u32 scope, u32 node, u32 port,
 				       u32 key)
 {
 	struct publication *publ = kzalloc(sizeof(*publ), GFP_ATOMIC);
@@ -116,9 +116,9 @@ static struct publication *publ_create(u32 type, u32 lower, u32 upper,
 	publ->upper = upper;
 	publ->scope = scope;
 	publ->node = node;
-	publ->ref = port_ref;
+	publ->port = port;
 	publ->key = key;
-	INIT_LIST_HEAD(&publ->pport_list);
+	INIT_LIST_HEAD(&publ->binding_sock);
 	return publ;
 }
 
@@ -237,9 +237,9 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net,
 		info = sseq->info;
 
 		/* Check if an identical publication already exists */
-		list_for_each_entry(publ, &info->cluster_list, cluster_list) {
-			if ((publ->ref == port) && (publ->key == key) &&
-			    (!publ->node || (publ->node == node)))
+		list_for_each_entry(publ, &info->all_publ, all_publ) {
+			if (publ->port == port && publ->key == key &&
+			    (!publ->node || publ->node == node))
 				return NULL;
 		}
 	} else {
@@ -278,8 +278,8 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net,
 			return NULL;
 		}
 
-		INIT_LIST_HEAD(&info->node_list);
-		INIT_LIST_HEAD(&info->cluster_list);
+		INIT_LIST_HEAD(&info->local_publ);
+		INIT_LIST_HEAD(&info->all_publ);
 
 		/* Insert new sub-sequence */
 		sseq = &nseq->sseqs[inspos];
@@ -298,15 +298,15 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net,
 	if (!publ)
 		return NULL;
 
-	list_add(&publ->cluster_list, &info->cluster_list);
+	list_add(&publ->all_publ, &info->all_publ);
 
 	if (in_own_node(net, node))
-		list_add(&publ->node_list, &info->node_list);
+		list_add(&publ->local_publ, &info->local_publ);
 
 	/* Any subscriptions waiting for notification?  */
 	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
 		tipc_sub_report_overlap(s, publ->lower, publ->upper,
-					TIPC_PUBLISHED, publ->ref,
+					TIPC_PUBLISHED, publ->port,
 					publ->node, publ->scope,
 					created_subseq);
 	}
@@ -327,7 +327,7 @@ static struct publication *tipc_nameseq_insert_publ(struct net *net,
 static struct publication *tipc_nameseq_remove_publ(struct net *net,
 						    struct name_seq *nseq,
 						    u32 inst, u32 node,
-						    u32 ref, u32 key)
+						    u32 port, u32 key)
 {
 	struct publication *publ;
 	struct sub_seq *sseq = nameseq_find_subseq(nseq, inst);
@@ -342,20 +342,20 @@ static struct publication *tipc_nameseq_remove_publ(struct net *net,
 	info = sseq->info;
 
 	/* Locate publication, if it exists */
-	list_for_each_entry(publ, &info->cluster_list, cluster_list) {
-		if ((publ->key == key) && (publ->ref == ref) &&
-		    (!publ->node || (publ->node == node)))
+	list_for_each_entry(publ, &info->all_publ, all_publ) {
+		if (publ->key == key && publ->port == port &&
+		    (!publ->node || publ->node == node))
 			goto found;
 	}
 	return NULL;
 
 found:
-	list_del(&publ->cluster_list);
+	list_del(&publ->all_publ);
 	if (in_own_node(net, node))
-		list_del(&publ->node_list);
+		list_del(&publ->local_publ);
 
 	/* Contract subseq list if no more publications for that subseq */
-	if (list_empty(&info->cluster_list)) {
+	if (list_empty(&info->all_publ)) {
 		kfree(info);
 		free = &nseq->sseqs[nseq->first_free--];
 		memmove(sseq, sseq + 1, (free - (sseq + 1)) * sizeof(*sseq));
@@ -365,8 +365,9 @@ found:
 	/* Notify any waiting subscriptions */
 	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
 		tipc_sub_report_overlap(s, publ->lower, publ->upper,
-					TIPC_WITHDRAWN, publ->ref, publ->node,
-					publ->scope, removed_subseq);
+					TIPC_WITHDRAWN, publ->port,
+					publ->node, publ->scope,
+					removed_subseq);
 	}
 
 	return publ;
@@ -402,12 +403,12 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq,
 			struct name_info *info = sseq->info;
 			int must_report = 1;
 
-			list_for_each_entry(crs, &info->cluster_list,
-					    cluster_list) {
+			list_for_each_entry(crs, &info->all_publ, all_publ) {
 				tipc_sub_report_overlap(sub, sseq->lower,
 							sseq->upper,
 							TIPC_PUBLISHED,
-							crs->ref, crs->node,
+							crs->port,
+							crs->node,
 							crs->scope,
 							must_report);
 				must_report = 0;
@@ -460,7 +461,7 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
 }
 
 struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
-					     u32 lower, u32 node, u32 ref,
+					     u32 lower, u32 node, u32 port,
 					     u32 key)
 {
 	struct publication *publ;
@@ -470,7 +471,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
 		return NULL;
 
 	spin_lock_bh(&seq->lock);
-	publ = tipc_nameseq_remove_publ(net, seq, lower, node, ref, key);
+	publ = tipc_nameseq_remove_publ(net, seq, lower, node, port, key);
 	if (!seq->first_free && list_empty(&seq->subscriptions)) {
 		hlist_del_init_rcu(&seq->ns_list);
 		kfree(seq->sseqs);
@@ -503,7 +504,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
 	struct name_info *info;
 	struct publication *publ;
 	struct name_seq *seq;
-	u32 ref = 0;
+	u32 port = 0;
 	u32 node = 0;
 
 	if (!tipc_in_scope(*destnode, tn->own_addr))
@@ -521,42 +522,42 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
 
 	/* Closest-First Algorithm */
 	if (likely(!*destnode)) {
-		if (!list_empty(&info->node_list)) {
-			publ = list_first_entry(&info->node_list,
+		if (!list_empty(&info->local_publ)) {
+			publ = list_first_entry(&info->local_publ,
 						struct publication,
-						node_list);
-			list_move_tail(&publ->node_list,
-				       &info->node_list);
+						local_publ);
+			list_move_tail(&publ->local_publ,
+				       &info->local_publ);
 		} else {
-			publ = list_first_entry(&info->cluster_list,
+			publ = list_first_entry(&info->all_publ,
 						struct publication,
-						cluster_list);
-			list_move_tail(&publ->cluster_list,
-				       &info->cluster_list);
+						all_publ);
+			list_move_tail(&publ->all_publ,
+				       &info->all_publ);
 		}
 	}
 
 	/* Round-Robin Algorithm */
 	else if (*destnode == tn->own_addr) {
-		if (list_empty(&info->node_list))
+		if (list_empty(&info->local_publ))
 			goto no_match;
-		publ = list_first_entry(&info->node_list, struct publication,
-					node_list);
-		list_move_tail(&publ->node_list, &info->node_list);
+		publ = list_first_entry(&info->local_publ, struct publication,
+					local_publ);
+		list_move_tail(&publ->local_publ, &info->local_publ);
 	} else {
-		publ = list_first_entry(&info->cluster_list, struct publication,
-					cluster_list);
-		list_move_tail(&publ->cluster_list, &info->cluster_list);
+		publ = list_first_entry(&info->all_publ, struct publication,
+					all_publ);
+		list_move_tail(&publ->all_publ, &info->all_publ);
 	}
 
-	ref = publ->ref;
+	port = publ->port;
 	node = publ->node;
 no_match:
 	spin_unlock_bh(&seq->lock);
 not_found:
 	rcu_read_unlock();
 	*destnode = node;
-	return ref;
+	return port;
 }
 
 bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope,
@@ -578,17 +579,16 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope,
 	sseq = nameseq_find_subseq(seq, instance);
 	if (likely(sseq)) {
 		info = sseq->info;
-		list_for_each_entry(publ, &info->cluster_list, cluster_list) {
+		list_for_each_entry(publ, &info->all_publ, all_publ) {
 			if (publ->scope != scope)
 				continue;
-			if (publ->ref == exclude && publ->node == self)
+			if (publ->port == exclude && publ->node == self)
 				continue;
-			tipc_dest_push(dsts, publ->node, publ->ref);
+			tipc_dest_push(dsts, publ->node, publ->port);
 			(*dstcnt)++;
 			if (all)
 				continue;
-			list_move_tail(&publ->cluster_list,
-				       &info->cluster_list);
+			list_move_tail(&publ->all_publ, &info->all_publ);
 			break;
 		}
 	}
@@ -619,9 +619,9 @@ void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
 		if (sseq->lower > upper)
 			break;
 		info = sseq->info;
-		list_for_each_entry(p, &info->node_list, node_list) {
+		list_for_each_entry(p, &info->local_publ, local_publ) {
 			if (p->scope == scope || (!exact && p->scope < scope))
-				tipc_dest_push(dports, 0, p->ref);
+				tipc_dest_push(dports, 0, p->port);
 		}
 	}
 	spin_unlock_bh(&seq->lock);
@@ -651,7 +651,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
 	stop = seq->sseqs + seq->first_free;
 	for (; sseq != stop && sseq->lower <= upper; sseq++) {
 		info = sseq->info;
-		list_for_each_entry(publ, &info->cluster_list, cluster_list) {
+		list_for_each_entry(publ, &info->all_publ, all_publ) {
 			tipc_nlist_add(nodes, publ->node);
 		}
 	}
@@ -680,10 +680,10 @@ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
 	stop = seq->sseqs + seq->first_free;
 	for (; sseq != stop; sseq++) {
 		info = sseq->info;
-		list_for_each_entry(p, &info->cluster_list, cluster_list) {
+		list_for_each_entry(p, &info->all_publ, all_publ) {
 			if (p->scope != scope)
 				continue;
-			tipc_group_add_member(grp, p->node, p->ref, p->lower);
+			tipc_group_add_member(grp, p->node, p->port, p->lower);
 		}
 	}
 	spin_unlock_bh(&seq->lock);
@@ -728,7 +728,7 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
 /**
  * tipc_nametbl_withdraw - withdraw name publication from network name tables
  */
-int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref,
+int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 port,
 			  u32 key)
 {
 	struct publication *publ;
@@ -737,18 +737,18 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref,
 
 	spin_lock_bh(&tn->nametbl_lock);
 	publ = tipc_nametbl_remove_publ(net, type, lower, tn->own_addr,
-					ref, key);
+					port, key);
 	if (likely(publ)) {
 		tn->nametbl->local_publ_count--;
 		skb = tipc_named_withdraw(net, publ);
 		/* Any pending external events? */
 		tipc_named_process_backlog(net);
-		list_del_init(&publ->pport_list);
+		list_del_init(&publ->binding_sock);
 		kfree_rcu(publ, rcu);
 	} else {
 		pr_err("Unable to remove local publication\n"
-		       "(type=%u, lower=%u, ref=%u, key=%u)\n",
-		       type, lower, ref, key);
+		       "(type=%u, lower=%u, port=%u, key=%u)\n",
+		       type, lower, port, key);
 	}
 	spin_unlock_bh(&tn->nametbl_lock);
 
@@ -851,10 +851,9 @@ static void tipc_purge_publications(struct net *net, struct name_seq *seq)
 	spin_lock_bh(&seq->lock);
 	sseq = seq->sseqs;
 	info = sseq->info;
-	list_for_each_entry_safe(publ, safe, &info->cluster_list,
-				 cluster_list) {
+	list_for_each_entry_safe(publ, safe, &info->all_publ, all_publ) {
 		tipc_nameseq_remove_publ(net, seq, publ->lower, publ->node,
-					 publ->ref, publ->key);
+					 publ->port, publ->key);
 		kfree_rcu(publ, rcu);
 	}
 	hlist_del_init_rcu(&seq->ns_list);
@@ -901,19 +900,17 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg,
 	struct publication *p;
 
 	if (*last_publ) {
-		list_for_each_entry(p, &sseq->info->cluster_list,
-				    cluster_list)
+		list_for_each_entry(p, &sseq->info->all_publ, all_publ)
 			if (p->key == *last_publ)
 				break;
 		if (p->key != *last_publ)
 			return -EPIPE;
 	} else {
-		p = list_first_entry(&sseq->info->cluster_list,
-				     struct publication,
-				     cluster_list);
+		p = list_first_entry(&sseq->info->all_publ, struct publication,
+				     all_publ);
 	}
 
-	list_for_each_entry_from(p, &sseq->info->cluster_list, cluster_list) {
+	list_for_each_entry_from(p, &sseq->info->all_publ, all_publ) {
 		*last_publ = p->key;
 
 		hdr = genlmsg_put(msg->skb, msg->portid, msg->seq,
@@ -940,7 +937,7 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg,
 			goto publ_msg_full;
 		if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->node))
 			goto publ_msg_full;
-		if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->ref))
+		if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->port))
 			goto publ_msg_full;
 		if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key))
 			goto publ_msg_full;
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index cb16bd883565..34a4ccb907aa 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -54,19 +54,22 @@ struct tipc_group;
  * @type: name sequence type
  * @lower: name sequence lower bound
  * @upper: name sequence upper bound
- * @scope: scope of publication
- * @node: network address of publishing port's node
- * @ref: publishing port
- * @key: publication key
- * @nodesub_list: subscription to "node down" event (off-node publication only)
- * @local_list: adjacent entries in list of publications made by this node
- * @pport_list: adjacent entries in list of publications made by this port
- * @node_list: adjacent matching name seq publications with >= node scope
- * @cluster_list: adjacent matching name seq publications with >= cluster scope
- * @zone_list: adjacent matching name seq publications with >= zone scope
+ * @scope: scope of publication, TIPC_NODE_SCOPE or TIPC_CLUSTER_SCOPE
+ * @node: network address of publishing socket's node
+ * @port: publishing port
+ * @key: publication key, unique across the cluster
+ * @binding_node: all publications from the same node which bound this one
+ * - Remote publications: in node->publ_list
+ *   Used by node/name distr to withdraw publications when node is lost
+ * - Local/node scope publications: in name_table->node_scope list
+ * - Local/cluster scope publications: in name_table->cluster_scope list
+ * @binding_sock: all publications from the same socket which bound this one
+ *   Used by socket to withdraw publications when socket is unbound/released
+ * @local_publ: list of identical publications made from this node
+ *   Used by closest_first and multicast receive lookup algorithms
+ * @all_publ: all publications identical to this one, whatever node and scope
+ *   Used by round-robin lookup algorithm
  * @rcu: RCU callback head used for deferred freeing
- *
- * Note that the node list, cluster list, and zone list are circular lists.
  */
 struct publication {
 	u32 type;
@@ -74,12 +77,12 @@ struct publication {
 	u32 upper;
 	u32 scope;
 	u32 node;
-	u32 ref;
+	u32 port;
 	u32 key;
 	struct list_head binding_node;
-	struct list_head pport_list;
-	struct list_head node_list;
-	struct list_head cluster_list;
+	struct list_head binding_sock;
+	struct list_head local_publ;
+	struct list_head all_publ;
 	struct rcu_head rcu;
 };
 
@@ -87,7 +90,10 @@ struct publication {
  * struct name_table - table containing all existing port name publications
  * @seq_hlist: name sequence hash lists
  * @node_scope: all local publications with node scope
+ *               - used by name_distr during re-init of name table
  * @cluster_scope: all local publications with cluster scope
+ *               - used by name_distr to send bulk updates to new nodes
+ *               - used by name_distr during re-init of name table
  * @local_publ_count: number of publications issued by this node
  */
 struct name_table {
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 910d3827f499..a4a9148d4629 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2605,7 +2605,7 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope,
 	if (unlikely(!publ))
 		return -EINVAL;
 
-	list_add(&publ->pport_list, &tsk->publications);
+	list_add(&publ->binding_sock, &tsk->publications);
 	tsk->pub_count++;
 	tsk->published = 1;
 	return 0;
@@ -2622,7 +2622,7 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
 	if (scope != TIPC_NODE_SCOPE)
 		scope = TIPC_CLUSTER_SCOPE;
 
-	list_for_each_entry_safe(publ, safe, &tsk->publications, pport_list) {
+	list_for_each_entry_safe(publ, safe, &tsk->publications, binding_sock) {
 		if (seq) {
 			if (publ->scope != scope)
 				continue;
@@ -2633,12 +2633,12 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
 			if (publ->upper != seq->upper)
 				break;
 			tipc_nametbl_withdraw(net, publ->type, publ->lower,
-					      publ->ref, publ->key);
+					      publ->port, publ->key);
 			rc = 0;
 			break;
 		}
 		tipc_nametbl_withdraw(net, publ->type, publ->lower,
-				      publ->ref, publ->key);
+				      publ->port, publ->key);
 		rc = 0;
 	}
 	if (list_empty(&tsk->publications))
@@ -3292,7 +3292,7 @@ static int __tipc_nl_list_sk_publ(struct sk_buff *skb,
 	struct publication *p;
 
 	if (*last_publ) {
-		list_for_each_entry(p, &tsk->publications, pport_list) {
+		list_for_each_entry(p, &tsk->publications, binding_sock) {
 			if (p->key == *last_publ)
 				break;
 		}
@@ -3309,10 +3309,10 @@ static int __tipc_nl_list_sk_publ(struct sk_buff *skb,
 		}
 	} else {
 		p = list_first_entry(&tsk->publications, struct publication,
-				     pport_list);
+				     binding_sock);
 	}
 
-	list_for_each_entry_from(p, &tsk->publications, pport_list) {
+	list_for_each_entry_from(p, &tsk->publications, binding_sock) {
 		err = __tipc_nl_add_sk_publ(skb, cb, p);
 		if (err) {
 			*last_publ = p->key;
-- 
cgit v1.2.3


From 53d0e83f9329aa51dcc205b514dbee05cb4df309 Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Thu, 15 Mar 2018 03:54:26 -0700
Subject: rds: tcp: must use spin_lock_irq* and not spin_lock_bh with
 rds_tcp_conn_lock

rds_tcp_connection allocation/free management has the potential to be
called from __rds_conn_create after IRQs have been disabled, so
spin_[un]lock_bh cannot be used with rds_tcp_conn_lock.

Bottom-halves that need to synchronize for critical sections protected
by rds_tcp_conn_lock should instead use rds_destroy_pending() correctly.

Reported-by: syzbot+c68e51bb5e699d3f8d91@syzkaller.appspotmail.com
Fixes: ebeeb1ad9b8a ("rds: tcp: use rds_destroy_pending() to synchronize
       netns/module teardown and rds connection/workq management")
Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/tcp.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index eb04e7fa2467..08ea9cd5c2f6 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -272,13 +272,14 @@ static int rds_tcp_laddr_check(struct net *net, __be32 addr)
 static void rds_tcp_conn_free(void *arg)
 {
 	struct rds_tcp_connection *tc = arg;
+	unsigned long flags;
 
 	rdsdebug("freeing tc %p\n", tc);
 
-	spin_lock_bh(&rds_tcp_conn_lock);
+	spin_lock_irqsave(&rds_tcp_conn_lock, flags);
 	if (!tc->t_tcp_node_detached)
 		list_del(&tc->t_tcp_node);
-	spin_unlock_bh(&rds_tcp_conn_lock);
+	spin_unlock_irqrestore(&rds_tcp_conn_lock, flags);
 
 	kmem_cache_free(rds_tcp_conn_slab, tc);
 }
@@ -308,13 +309,13 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 		rdsdebug("rds_conn_path [%d] tc %p\n", i,
 			 conn->c_path[i].cp_transport_data);
 	}
-	spin_lock_bh(&rds_tcp_conn_lock);
+	spin_lock_irq(&rds_tcp_conn_lock);
 	for (i = 0; i < RDS_MPATH_WORKERS; i++) {
 		tc = conn->c_path[i].cp_transport_data;
 		tc->t_tcp_node_detached = false;
 		list_add_tail(&tc->t_tcp_node, &rds_tcp_conn_list);
 	}
-	spin_unlock_bh(&rds_tcp_conn_lock);
+	spin_unlock_irq(&rds_tcp_conn_lock);
 fail:
 	if (ret) {
 		for (j = 0; j < i; j++)
@@ -527,7 +528,7 @@ static void rds_tcp_kill_sock(struct net *net)
 
 	rtn->rds_tcp_listen_sock = NULL;
 	rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
-	spin_lock_bh(&rds_tcp_conn_lock);
+	spin_lock_irq(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
 		struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
 
@@ -540,7 +541,7 @@ static void rds_tcp_kill_sock(struct net *net)
 			tc->t_tcp_node_detached = true;
 		}
 	}
-	spin_unlock_bh(&rds_tcp_conn_lock);
+	spin_unlock_irq(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node)
 		rds_conn_destroy(tc->t_cpath->cp_conn);
 }
@@ -588,7 +589,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
 {
 	struct rds_tcp_connection *tc, *_tc;
 
-	spin_lock_bh(&rds_tcp_conn_lock);
+	spin_lock_irq(&rds_tcp_conn_lock);
 	list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
 		struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
 
@@ -598,7 +599,7 @@ static void rds_tcp_sysctl_reset(struct net *net)
 		/* reconnect with new parameters */
 		rds_conn_path_drop(tc->t_cpath, false);
 	}
-	spin_unlock_bh(&rds_tcp_conn_lock);
+	spin_unlock_irq(&rds_tcp_conn_lock);
 }
 
 static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
-- 
cgit v1.2.3


From d47d08c8ca052df3d9fde7cfff518660335b16e7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Fri, 16 Mar 2018 23:32:51 +0000
Subject: sctp: use proc_remove_subtree()

use proc_remove_subtree() for subtree removal, both on setup failure
halfway through and on teardown.  No need to make simple things
complex...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/objcnt.c   |  8 -----
 net/sctp/proc.c     | 90 +++++++++++++----------------------------------------
 net/sctp/protocol.c | 59 ++++-------------------------------
 3 files changed, 27 insertions(+), 130 deletions(-)

(limited to 'net')

diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c
index aeea6da81441..fd2684ad94c8 100644
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -130,11 +130,3 @@ void sctp_dbg_objcnt_init(struct net *net)
 	if (!ent)
 		pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n");
 }
-
-/* Cleanup the objcount entry in the proc filesystem.  */
-void sctp_dbg_objcnt_exit(struct net *net)
-{
-	remove_proc_entry("sctp_dbg_objcnt", net->sctp.proc_net_sctp);
-}
-
-
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 537545ebcb0e..17d0155d9de3 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -101,25 +101,6 @@ static const struct file_operations sctp_snmp_seq_fops = {
 	.release = single_release_net,
 };
 
-/* Set up the proc fs entry for 'snmp' object. */
-int __net_init sctp_snmp_proc_init(struct net *net)
-{
-	struct proc_dir_entry *p;
-
-	p = proc_create("snmp", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_snmp_seq_fops);
-	if (!p)
-		return -ENOMEM;
-
-	return 0;
-}
-
-/* Cleanup the proc fs entry for 'snmp' object. */
-void sctp_snmp_proc_exit(struct net *net)
-{
-	remove_proc_entry("snmp", net->sctp.proc_net_sctp);
-}
-
 /* Dump local addresses of an association/endpoint. */
 static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb)
 {
@@ -259,25 +240,6 @@ static const struct file_operations sctp_eps_seq_fops = {
 	.release = seq_release_net,
 };
 
-/* Set up the proc fs entry for 'eps' object. */
-int __net_init sctp_eps_proc_init(struct net *net)
-{
-	struct proc_dir_entry *p;
-
-	p = proc_create("eps", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_eps_seq_fops);
-	if (!p)
-		return -ENOMEM;
-
-	return 0;
-}
-
-/* Cleanup the proc fs entry for 'eps' object. */
-void sctp_eps_proc_exit(struct net *net)
-{
-	remove_proc_entry("eps", net->sctp.proc_net_sctp);
-}
-
 struct sctp_ht_iter {
 	struct seq_net_private p;
 	struct rhashtable_iter hti;
@@ -390,25 +352,6 @@ static const struct file_operations sctp_assocs_seq_fops = {
 	.release = seq_release_net,
 };
 
-/* Set up the proc fs entry for 'assocs' object. */
-int __net_init sctp_assocs_proc_init(struct net *net)
-{
-	struct proc_dir_entry *p;
-
-	p = proc_create("assocs", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_assocs_seq_fops);
-	if (!p)
-		return -ENOMEM;
-
-	return 0;
-}
-
-/* Cleanup the proc fs entry for 'assocs' object. */
-void sctp_assocs_proc_exit(struct net *net)
-{
-	remove_proc_entry("assocs", net->sctp.proc_net_sctp);
-}
-
 static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
 {
 	struct sctp_association *assoc;
@@ -488,12 +431,6 @@ static const struct seq_operations sctp_remaddr_ops = {
 	.show  = sctp_remaddr_seq_show,
 };
 
-/* Cleanup the proc fs entry for 'remaddr' object. */
-void sctp_remaddr_proc_exit(struct net *net)
-{
-	remove_proc_entry("remaddr", net->sctp.proc_net_sctp);
-}
-
 static int sctp_remaddr_seq_open(struct inode *inode, struct file *file)
 {
 	return seq_open_net(inode, file, &sctp_remaddr_ops,
@@ -507,13 +444,28 @@ static const struct file_operations sctp_remaddr_seq_fops = {
 	.release = seq_release_net,
 };
 
-int __net_init sctp_remaddr_proc_init(struct net *net)
+/* Set up the proc fs entry for the SCTP protocol. */
+int __net_init sctp_proc_init(struct net *net)
 {
-	struct proc_dir_entry *p;
-
-	p = proc_create("remaddr", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_remaddr_seq_fops);
-	if (!p)
+	net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net);
+	if (!net->sctp.proc_net_sctp)
 		return -ENOMEM;
+	if (!proc_create("snmp", S_IRUGO, net->sctp.proc_net_sctp,
+			&sctp_snmp_seq_fops))
+		goto cleanup;
+	if (!proc_create("eps", S_IRUGO, net->sctp.proc_net_sctp,
+			&sctp_eps_seq_fops))
+		goto cleanup;
+	if (!proc_create("assocs", S_IRUGO, net->sctp.proc_net_sctp,
+			&sctp_assocs_seq_fops))
+		goto cleanup;
+	if (!proc_create("remaddr", S_IRUGO, net->sctp.proc_net_sctp,
+			&sctp_remaddr_seq_fops))
+		goto cleanup;
 	return 0;
+
+cleanup:
+	remove_proc_subtree("sctp", net->proc_net);
+	net->sctp.proc_net_sctp = NULL;
+	return -ENOMEM;
 }
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 606361ee9e4a..493b817f6a2a 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -80,56 +80,6 @@ long sysctl_sctp_mem[3];
 int sysctl_sctp_rmem[3];
 int sysctl_sctp_wmem[3];
 
-/* Set up the proc fs entry for the SCTP protocol. */
-static int __net_init sctp_proc_init(struct net *net)
-{
-#ifdef CONFIG_PROC_FS
-	net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net);
-	if (!net->sctp.proc_net_sctp)
-		goto out_proc_net_sctp;
-	if (sctp_snmp_proc_init(net))
-		goto out_snmp_proc_init;
-	if (sctp_eps_proc_init(net))
-		goto out_eps_proc_init;
-	if (sctp_assocs_proc_init(net))
-		goto out_assocs_proc_init;
-	if (sctp_remaddr_proc_init(net))
-		goto out_remaddr_proc_init;
-
-	return 0;
-
-out_remaddr_proc_init:
-	sctp_assocs_proc_exit(net);
-out_assocs_proc_init:
-	sctp_eps_proc_exit(net);
-out_eps_proc_init:
-	sctp_snmp_proc_exit(net);
-out_snmp_proc_init:
-	remove_proc_entry("sctp", net->proc_net);
-	net->sctp.proc_net_sctp = NULL;
-out_proc_net_sctp:
-	return -ENOMEM;
-#endif /* CONFIG_PROC_FS */
-	return 0;
-}
-
-/* Clean up the proc fs entry for the SCTP protocol.
- * Note: Do not make this __exit as it is used in the init error
- * path.
- */
-static void sctp_proc_exit(struct net *net)
-{
-#ifdef CONFIG_PROC_FS
-	sctp_snmp_proc_exit(net);
-	sctp_eps_proc_exit(net);
-	sctp_assocs_proc_exit(net);
-	sctp_remaddr_proc_exit(net);
-
-	remove_proc_entry("sctp", net->proc_net);
-	net->sctp.proc_net_sctp = NULL;
-#endif
-}
-
 /* Private helper to extract ipv4 address and stash them in
  * the protocol structure.
  */
@@ -1285,10 +1235,12 @@ static int __net_init sctp_defaults_init(struct net *net)
 	if (status)
 		goto err_init_mibs;
 
+#ifdef CONFIG_PROC_FS
 	/* Initialize proc fs directory.  */
 	status = sctp_proc_init(net);
 	if (status)
 		goto err_init_proc;
+#endif
 
 	sctp_dbg_objcnt_init(net);
 
@@ -1320,9 +1272,10 @@ static void __net_exit sctp_defaults_exit(struct net *net)
 	sctp_free_addr_wq(net);
 	sctp_free_local_addr_list(net);
 
-	sctp_dbg_objcnt_exit(net);
-
-	sctp_proc_exit(net);
+#ifdef CONFIG_PROC_FS
+	remove_proc_subtree("sctp", net->proc_net);
+	net->sctp.proc_net_sctp = NULL;
+#endif
 	cleanup_sctp_mibs(net);
 	sctp_sysctl_net_unregister(net);
 }
-- 
cgit v1.2.3


From 2c3682f0be97a5f57c6c8b40fa154dfc77efb461 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 18 Mar 2018 12:56:49 -0700
Subject: sock: make static tls function alloc_sg generic sock helper

The TLS ULP module builds scatterlists from a sock using
page_frag_refill(). This is going to be useful for other ULPs
so move it into sock file for more general use.

In the process remove useless goto at end of while loop.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/sock.c  | 56 +++++++++++++++++++++++++++++++++++++++++++++
 net/tls/tls_sw.c | 69 ++++++--------------------------------------------------
 2 files changed, 63 insertions(+), 62 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index 27f218bba43f..f68dff0d7bc4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2239,6 +2239,62 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
 }
 EXPORT_SYMBOL(sk_page_frag_refill);
 
+int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
+		int *sg_num_elem, unsigned int *sg_size,
+		int first_coalesce)
+{
+	struct page_frag *pfrag;
+	unsigned int size = *sg_size;
+	int num_elem = *sg_num_elem, use = 0, rc = 0;
+	struct scatterlist *sge;
+	unsigned int orig_offset;
+
+	len -= size;
+	pfrag = sk_page_frag(sk);
+
+	while (len > 0) {
+		if (!sk_page_frag_refill(sk, pfrag)) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		use = min_t(int, len, pfrag->size - pfrag->offset);
+
+		if (!sk_wmem_schedule(sk, use)) {
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		sk_mem_charge(sk, use);
+		size += use;
+		orig_offset = pfrag->offset;
+		pfrag->offset += use;
+
+		sge = sg + num_elem - 1;
+		if (num_elem > first_coalesce && sg_page(sg) == pfrag->page &&
+		    sg->offset + sg->length == orig_offset) {
+			sg->length += use;
+		} else {
+			sge++;
+			sg_unmark_end(sge);
+			sg_set_page(sge, pfrag->page, use, orig_offset);
+			get_page(pfrag->page);
+			++num_elem;
+			if (num_elem == MAX_SKB_FRAGS) {
+				rc = -ENOSPC;
+				break;
+			}
+		}
+
+		len -= use;
+	}
+out:
+	*sg_size = size;
+	*sg_num_elem = num_elem;
+	return rc;
+}
+EXPORT_SYMBOL(sk_alloc_sg);
+
 static void __lock_sock(struct sock *sk)
 	__releases(&sk->sk_lock.slock)
 	__acquires(&sk->sk_lock.slock)
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index f26376e954ae..0fc8a24c6473 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -87,71 +87,16 @@ static void trim_both_sgl(struct sock *sk, int target_size)
 		target_size);
 }
 
-static int alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
-		    int *sg_num_elem, unsigned int *sg_size,
-		    int first_coalesce)
-{
-	struct page_frag *pfrag;
-	unsigned int size = *sg_size;
-	int num_elem = *sg_num_elem, use = 0, rc = 0;
-	struct scatterlist *sge;
-	unsigned int orig_offset;
-
-	len -= size;
-	pfrag = sk_page_frag(sk);
-
-	while (len > 0) {
-		if (!sk_page_frag_refill(sk, pfrag)) {
-			rc = -ENOMEM;
-			goto out;
-		}
-
-		use = min_t(int, len, pfrag->size - pfrag->offset);
-
-		if (!sk_wmem_schedule(sk, use)) {
-			rc = -ENOMEM;
-			goto out;
-		}
-
-		sk_mem_charge(sk, use);
-		size += use;
-		orig_offset = pfrag->offset;
-		pfrag->offset += use;
-
-		sge = sg + num_elem - 1;
-		if (num_elem > first_coalesce && sg_page(sg) == pfrag->page &&
-		    sg->offset + sg->length == orig_offset) {
-			sg->length += use;
-		} else {
-			sge++;
-			sg_unmark_end(sge);
-			sg_set_page(sge, pfrag->page, use, orig_offset);
-			get_page(pfrag->page);
-			++num_elem;
-			if (num_elem == MAX_SKB_FRAGS) {
-				rc = -ENOSPC;
-				break;
-			}
-		}
-
-		len -= use;
-	}
-	goto out;
-
-out:
-	*sg_size = size;
-	*sg_num_elem = num_elem;
-	return rc;
-}
-
 static int alloc_encrypted_sg(struct sock *sk, int len)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
 	int rc = 0;
 
-	rc = alloc_sg(sk, len, ctx->sg_encrypted_data,
-		      &ctx->sg_encrypted_num_elem, &ctx->sg_encrypted_size, 0);
+	rc = sk_alloc_sg(sk, len,
+			 ctx->sg_encrypted_data,
+			 &ctx->sg_encrypted_num_elem,
+			 &ctx->sg_encrypted_size, 0);
 
 	return rc;
 }
@@ -162,9 +107,9 @@ static int alloc_plaintext_sg(struct sock *sk, int len)
 	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
 	int rc = 0;
 
-	rc = alloc_sg(sk, len, ctx->sg_plaintext_data,
-		      &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size,
-		      tls_ctx->pending_open_record_frags);
+	rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data,
+			 &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size,
+			 tls_ctx->pending_open_record_frags);
 
 	return rc;
 }
-- 
cgit v1.2.3


From 312fc2b4c82e96a48cb2d0da2bd4816eb253c499 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 18 Mar 2018 12:57:00 -0700
Subject: net: do_tcp_sendpages flag to avoid SKBTX_SHARED_FRAG

When calling do_tcp_sendpages() from in kernel and we know the data
has no references from user side we can omit SKBTX_SHARED_FRAG flag.
This patch adds an internal flag, NO_SKBTX_SHARED_FRAG that can be used
to omit setting SKBTX_SHARED_FRAG.

The flag is not exposed to userspace because the sendpage call from
the splice logic masks out all bits except MSG_MORE.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/ipv4/tcp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index fb350f740f69..f90ec24c2cc8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -994,7 +994,9 @@ new_segment:
 			get_page(page);
 			skb_fill_page_desc(skb, i, page, offset, copy);
 		}
-		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+
+		if (!(flags & MSG_NO_SHARED_FRAGS))
+			skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
 
 		skb->len += copy;
 		skb->data_len += copy;
-- 
cgit v1.2.3


From 8c05dbf04b2882c3c0bc43fe7668c720210877f3 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 18 Mar 2018 12:57:05 -0700
Subject: net: generalize sk_alloc_sg to work with scatterlist rings

The current implementation of sk_alloc_sg expects scatterlist to always
start at entry 0 and complete at entry MAX_SKB_FRAGS.

Future patches will want to support starting at arbitrary offset into
scatterlist so add an additional sg_start parameters and then default
to the current values in TLS code paths.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/sock.c  | 27 ++++++++++++++++-----------
 net/tls/tls_sw.c |  4 ++--
 2 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/core/sock.c b/net/core/sock.c
index f68dff0d7bc4..4f92c2910200 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2240,19 +2240,20 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
 EXPORT_SYMBOL(sk_page_frag_refill);
 
 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
-		int *sg_num_elem, unsigned int *sg_size,
+		int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
 		int first_coalesce)
 {
+	int sg_curr = *sg_curr_index, use = 0, rc = 0;
+	unsigned int size = *sg_curr_size;
 	struct page_frag *pfrag;
-	unsigned int size = *sg_size;
-	int num_elem = *sg_num_elem, use = 0, rc = 0;
 	struct scatterlist *sge;
-	unsigned int orig_offset;
 
 	len -= size;
 	pfrag = sk_page_frag(sk);
 
 	while (len > 0) {
+		unsigned int orig_offset;
+
 		if (!sk_page_frag_refill(sk, pfrag)) {
 			rc = -ENOMEM;
 			goto out;
@@ -2270,17 +2271,21 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
 		orig_offset = pfrag->offset;
 		pfrag->offset += use;
 
-		sge = sg + num_elem - 1;
-		if (num_elem > first_coalesce && sg_page(sg) == pfrag->page &&
+		sge = sg + sg_curr - 1;
+		if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page &&
 		    sg->offset + sg->length == orig_offset) {
 			sg->length += use;
 		} else {
-			sge++;
+			sge = sg + sg_curr;
 			sg_unmark_end(sge);
 			sg_set_page(sge, pfrag->page, use, orig_offset);
 			get_page(pfrag->page);
-			++num_elem;
-			if (num_elem == MAX_SKB_FRAGS) {
+			sg_curr++;
+
+			if (sg_curr == MAX_SKB_FRAGS)
+				sg_curr = 0;
+
+			if (sg_curr == sg_start) {
 				rc = -ENOSPC;
 				break;
 			}
@@ -2289,8 +2294,8 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
 		len -= use;
 	}
 out:
-	*sg_size = size;
-	*sg_num_elem = num_elem;
+	*sg_curr_size = size;
+	*sg_curr_index = sg_curr;
 	return rc;
 }
 EXPORT_SYMBOL(sk_alloc_sg);
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 0fc8a24c6473..057a558ed6d7 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -94,7 +94,7 @@ static int alloc_encrypted_sg(struct sock *sk, int len)
 	int rc = 0;
 
 	rc = sk_alloc_sg(sk, len,
-			 ctx->sg_encrypted_data,
+			 ctx->sg_encrypted_data, 0,
 			 &ctx->sg_encrypted_num_elem,
 			 &ctx->sg_encrypted_size, 0);
 
@@ -107,7 +107,7 @@ static int alloc_plaintext_sg(struct sock *sk, int len)
 	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
 	int rc = 0;
 
-	rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data,
+	rc = sk_alloc_sg(sk, len, ctx->sg_plaintext_data, 0,
 			 &ctx->sg_plaintext_num_elem, &ctx->sg_plaintext_size,
 			 tls_ctx->pending_open_record_frags);
 
-- 
cgit v1.2.3


From 4f738adba30a7cfc006f605707e7aee847ffefa0 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 18 Mar 2018 12:57:10 -0700
Subject: bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data

This implements a BPF ULP layer to allow policy enforcement and
monitoring at the socket layer. In order to support this a new
program type BPF_PROG_TYPE_SK_MSG is used to run the policy at
the sendmsg/sendpage hook. To attach the policy to sockets a
sockmap is used with a new program attach type BPF_SK_MSG_VERDICT.

Similar to previous sockmap usages when a sock is added to a
sockmap, via a map update, if the map contains a BPF_SK_MSG_VERDICT
program type attached then the BPF ULP layer is created on the
socket and the attached BPF_PROG_TYPE_SK_MSG program is run for
every msg in sendmsg case and page/offset in sendpage case.

BPF_PROG_TYPE_SK_MSG Semantics/API:

BPF_PROG_TYPE_SK_MSG supports only two return codes SK_PASS and
SK_DROP. Returning SK_DROP free's the copied data in the sendmsg
case and in the sendpage case leaves the data untouched. Both cases
return -EACESS to the user. Returning SK_PASS will allow the msg to
be sent.

In the sendmsg case data is copied into kernel space buffers before
running the BPF program. The kernel space buffers are stored in a
scatterlist object where each element is a kernel memory buffer.
Some effort is made to coalesce data from the sendmsg call here.
For example a sendmsg call with many one byte iov entries will
likely be pushed into a single entry. The BPF program is run with
data pointers (start/end) pointing to the first sg element.

In the sendpage case data is not copied. We opt not to copy the
data by default here, because the BPF infrastructure does not
know what bytes will be needed nor when they will be needed. So
copying all bytes may be wasteful. Because of this the initial
start/end data pointers are (0,0). Meaning no data can be read or
written. This avoids reading data that may be modified by the
user. A new helper is added later in this series if reading and
writing the data is needed. The helper call will do a copy by
default so that the page is exclusively owned by the BPF call.

The verdict from the BPF_PROG_TYPE_SK_MSG applies to the entire msg
in the sendmsg() case and the entire page/offset in the sendpage case.
This avoids ambiguity on how to handle mixed return codes in the
sendmsg case. Again a helper is added later in the series if
a verdict needs to apply to multiple system calls and/or only
a subpart of the currently being processed message.

The helper msg_redirect_map() can be used to select the socket to
send the data on. This is used similar to existing redirect use
cases. This allows policy to redirect msgs.

Pseudo code simple example:

The basic logic to attach a program to a socket is as follows,

  // load the programs
  bpf_prog_load(SOCKMAP_TCP_MSG_PROG, BPF_PROG_TYPE_SK_MSG,
		&obj, &msg_prog);

  // lookup the sockmap
  bpf_map_msg = bpf_object__find_map_by_name(obj, "my_sock_map");

  // get fd for sockmap
  map_fd_msg = bpf_map__fd(bpf_map_msg);

  // attach program to sockmap
  bpf_prog_attach(msg_prog, map_fd_msg, BPF_SK_MSG_VERDICT, 0);

Adding sockets to the map is done in the normal way,

  // Add a socket 'fd' to sockmap at location 'i'
  bpf_map_update_elem(map_fd_msg, &i, fd, BPF_ANY);

After the above any socket attached to "my_sock_map", in this case
'fd', will run the BPF msg verdict program (msg_prog) on every
sendmsg and sendpage system call.

For a complete example see BPF selftests or sockmap samples.

Implementation notes:

It seemed the simplest, to me at least, to use a refcnt to ensure
psock is not lost across the sendmsg copy into the sg, the bpf program
running on the data in sg_data, and the final pass to the TCP stack.
Some performance testing may show a better method to do this and avoid
the refcnt cost, but for now use the simpler method.

Another item that will come after basic support is in place is
supporting MSG_MORE flag. At the moment we call sendpages even if
the MSG_MORE flag is set. An enhancement would be to collect the
pages into a larger scatterlist and pass down the stack. Notice that
bpf_tcp_sendmsg() could support this with some additional state saved
across sendmsg calls. I built the code to support this without having
to do refactoring work. Other features TBD include ZEROCOPY and the
TCP_RECV_QUEUE/TCP_NO_QUEUE support. This will follow initial series
shortly.

Future work could improve size limits on the scatterlist rings used
here. Currently, we use MAX_SKB_FRAGS simply because this was being
used already in the TLS case. Future work could extend the kernel sk
APIs to tune this depending on workload. This is a trade-off
between memory usage and throughput performance.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 33edfa8372fd..2b6c47597eab 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1890,6 +1890,44 @@ static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
 	.arg4_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
+	   struct bpf_map *, map, u32, key, u64, flags)
+{
+	/* If user passes invalid input drop the packet. */
+	if (unlikely(flags))
+		return SK_DROP;
+
+	msg->key = key;
+	msg->flags = flags;
+	msg->map = map;
+
+	return SK_PASS;
+}
+
+struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
+{
+	struct sock *sk = NULL;
+
+	if (msg->map) {
+		sk = __sock_map_lookup_elem(msg->map, msg->key);
+
+		msg->key = 0;
+		msg->map = NULL;
+	}
+
+	return sk;
+}
+
+static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
+	.func           = bpf_msg_redirect_map,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_ANYTHING,
+	.arg4_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -3591,6 +3629,16 @@ static const struct bpf_func_proto *
 	}
 }
 
+static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_msg_redirect_map:
+		return &bpf_msg_redirect_map_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
 static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -3980,6 +4028,32 @@ static bool sk_skb_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, info);
 }
 
+static bool sk_msg_is_valid_access(int off, int size,
+				   enum bpf_access_type type,
+				   struct bpf_insn_access_aux *info)
+{
+	if (type == BPF_WRITE)
+		return false;
+
+	switch (off) {
+	case offsetof(struct sk_msg_md, data):
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct sk_msg_md, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	}
+
+	if (off < 0 || off >= sizeof(struct sk_msg_md))
+		return false;
+	if (off % size != 0)
+		return false;
+	if (size != sizeof(__u64))
+		return false;
+
+	return true;
+}
+
 static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				  const struct bpf_insn *si,
 				  struct bpf_insn *insn_buf,
@@ -4778,6 +4852,29 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
+static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
+				     const struct bpf_insn *si,
+				     struct bpf_insn *insn_buf,
+				     struct bpf_prog *prog, u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct sk_msg_md, data):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, data));
+		break;
+	case offsetof(struct sk_msg_md, data_end):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, data_end));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 const struct bpf_verifier_ops sk_filter_verifier_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
@@ -4868,6 +4965,15 @@ const struct bpf_verifier_ops sk_skb_verifier_ops = {
 const struct bpf_prog_ops sk_skb_prog_ops = {
 };
 
+const struct bpf_verifier_ops sk_msg_verifier_ops = {
+	.get_func_proto		= sk_msg_func_proto,
+	.is_valid_access	= sk_msg_is_valid_access,
+	.convert_ctx_access	= sk_msg_convert_ctx_access,
+};
+
+const struct bpf_prog_ops sk_msg_prog_ops = {
+};
+
 int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
-- 
cgit v1.2.3


From 2a100317c9ebc204a166f16294884fbf9da074ce Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 18 Mar 2018 12:57:15 -0700
Subject: bpf: sockmap, add bpf_msg_apply_bytes() helper

A single sendmsg or sendfile system call can contain multiple logical
messages that a BPF program may want to read and apply a verdict. But,
without an apply_bytes helper any verdict on the data applies to all
bytes in the sendmsg/sendfile. Alternatively, a BPF program may only
care to read the first N bytes of a msg. If the payload is large say
MB or even GB setting up and calling the BPF program repeatedly for
all bytes, even though the verdict is already known, creates
unnecessary overhead.

To allow BPF programs to control how many bytes a given verdict
applies to we implement a bpf_msg_apply_bytes() helper. When called
from within a BPF program this sets a counter, internal to the
BPF infrastructure, that applies the last verdict to the next N
bytes. If the N is smaller than the current data being processed
from a sendmsg/sendfile call, the first N bytes will be sent and
the BPF program will be re-run with start_data pointing to the N+1
byte. If N is larger than the current data being processed the
BPF verdict will be applied to multiple sendmsg/sendfile calls
until N bytes are consumed.

Note1 if a socket closes with apply_bytes counter non-zero this
is not a problem because data is not being buffered for N bytes
and is sent as its received.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 2b6c47597eab..17d6775f5431 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1928,6 +1928,20 @@ static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
 	.arg4_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
+{
+	msg->apply_bytes = bytes;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
+	.func           = bpf_msg_apply_bytes,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -3634,6 +3648,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
 	switch (func_id) {
 	case BPF_FUNC_msg_redirect_map:
 		return &bpf_msg_redirect_map_proto;
+	case BPF_FUNC_msg_apply_bytes:
+		return &bpf_msg_apply_bytes_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 91843d540a139eb8070bcff8aa10089164436deb Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 18 Mar 2018 12:57:20 -0700
Subject: bpf: sockmap, add msg_cork_bytes() helper

In the case where we need a specific number of bytes before a
verdict can be assigned, even if the data spans multiple sendmsg
or sendfile calls. The BPF program may use msg_cork_bytes().

The extreme case is a user can call sendmsg repeatedly with
1-byte msg segments. Obviously, this is bad for performance but
is still valid. If the BPF program needs N bytes to validate
a header it can use msg_cork_bytes to specify N bytes and the
BPF program will not be called again until N bytes have been
accumulated. The infrastructure will attempt to coalesce data
if possible so in many cases (most my use cases at least) the
data will be in a single scatterlist element with data pointers
pointing to start/end of the element. However, this is dependent
on available memory so is not guaranteed. So BPF programs must
validate data pointer ranges, but this is the case anyways to
convince the verifier the accesses are valid.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 17d6775f5431..0c9daf6ee555 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1942,6 +1942,20 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
+{
+	msg->cork_bytes = bytes;
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
+	.func           = bpf_msg_cork_bytes,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -3650,6 +3664,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
 		return &bpf_msg_redirect_map_proto;
 	case BPF_FUNC_msg_apply_bytes:
 		return &bpf_msg_apply_bytes_proto;
+	case BPF_FUNC_msg_cork_bytes:
+		return &bpf_msg_cork_bytes_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 015632bb30daaaee64e1bcac07570860e0bf3092 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 18 Mar 2018 12:57:25 -0700
Subject: bpf: sk_msg program helper bpf_sk_msg_pull_data

Currently, if a bpf sk msg program is run the program
can only parse data that the (start,end) pointers already
consumed. For sendmsg hooks this is likely the first
scatterlist element. For sendpage this will be the range
(0,0) because the data is shared with userspace and by
default we want to avoid allowing userspace to modify
data while (or after) BPF verdict is being decided.

To support pulling in additional bytes for parsing use
a new helper bpf_sk_msg_pull(start, end, flags) which
works similar to cls tc logic. This helper will attempt
to point the data start pointer at 'start' bytes offest
into msg and data end pointer at 'end' bytes offset into
message.

After basic sanity checks to ensure 'start' <= 'end' and
'end' <= msg_length there are a few cases we need to
handle.

First the sendmsg hook has already copied the data from
userspace and has exclusive access to it. Therefor, it
is not necessesary to copy the data. However, it may
be required. After finding the scatterlist element with
'start' offset byte in it there are two cases. One the
range (start,end) is entirely contained in the sg element
and is already linear. All that is needed is to update the
data pointers, no allocate/copy is needed. The other case
is (start, end) crosses sg element boundaries. In this
case we allocate a block of size 'end - start' and copy
the data to linearize it.

Next sendpage hook has not copied any data in initial
state so that data pointers are (0,0). In this case we
handle it similar to the above sendmsg case except the
allocation/copy must always happen. Then when sending
the data we have possibly three memory regions that
need to be sent, (0, start - 1), (start, end), and
(end + 1, msg_length). This is required to ensure any
writes by the BPF program are correctly transmitted.

Lastly this operation will invalidate any previous
data checks so BPF programs will have to revalidate
pointers after making this BPF call.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 135 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 134 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 0c9daf6ee555..c86f03fd9ea5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1956,6 +1956,136 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_msg_pull_data,
+	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+{
+	unsigned int len = 0, offset = 0, copy = 0;
+	struct scatterlist *sg = msg->sg_data;
+	int first_sg, last_sg, i, shift;
+	unsigned char *p, *to, *from;
+	int bytes = end - start;
+	struct page *page;
+
+	if (unlikely(flags || end <= start))
+		return -EINVAL;
+
+	/* First find the starting scatterlist element */
+	i = msg->sg_start;
+	do {
+		len = sg[i].length;
+		offset += len;
+		if (start < offset + len)
+			break;
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (i != msg->sg_end);
+
+	if (unlikely(start >= offset + len))
+		return -EINVAL;
+
+	if (!msg->sg_copy[i] && bytes <= len)
+		goto out;
+
+	first_sg = i;
+
+	/* At this point we need to linearize multiple scatterlist
+	 * elements or a single shared page. Either way we need to
+	 * copy into a linear buffer exclusively owned by BPF. Then
+	 * place the buffer in the scatterlist and fixup the original
+	 * entries by removing the entries now in the linear buffer
+	 * and shifting the remaining entries. For now we do not try
+	 * to copy partial entries to avoid complexity of running out
+	 * of sg_entry slots. The downside is reading a single byte
+	 * will copy the entire sg entry.
+	 */
+	do {
+		copy += sg[i].length;
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+		if (bytes < copy)
+			break;
+	} while (i != msg->sg_end);
+	last_sg = i;
+
+	if (unlikely(copy < end - start))
+		return -EINVAL;
+
+	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));
+	if (unlikely(!page))
+		return -ENOMEM;
+	p = page_address(page);
+	offset = 0;
+
+	i = first_sg;
+	do {
+		from = sg_virt(&sg[i]);
+		len = sg[i].length;
+		to = p + offset;
+
+		memcpy(to, from, len);
+		offset += len;
+		sg[i].length = 0;
+		put_page(sg_page(&sg[i]));
+
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (i != last_sg);
+
+	sg[first_sg].length = copy;
+	sg_set_page(&sg[first_sg], page, copy, 0);
+
+	/* To repair sg ring we need to shift entries. If we only
+	 * had a single entry though we can just replace it and
+	 * be done. Otherwise walk the ring and shift the entries.
+	 */
+	shift = last_sg - first_sg - 1;
+	if (!shift)
+		goto out;
+
+	i = first_sg + 1;
+	do {
+		int move_from;
+
+		if (i + shift >= MAX_SKB_FRAGS)
+			move_from = i + shift - MAX_SKB_FRAGS;
+		else
+			move_from = i + shift;
+
+		if (move_from == msg->sg_end)
+			break;
+
+		sg[i] = sg[move_from];
+		sg[move_from].length = 0;
+		sg[move_from].page_link = 0;
+		sg[move_from].offset = 0;
+
+		i++;
+		if (i == MAX_SKB_FRAGS)
+			i = 0;
+	} while (1);
+	msg->sg_end -= shift;
+	if (msg->sg_end < 0)
+		msg->sg_end += MAX_SKB_FRAGS;
+out:
+	msg->data = sg_virt(&sg[i]) + start - offset;
+	msg->data_end = msg->data + bytes;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_pull_data_proto = {
+	.func		= bpf_msg_pull_data,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
 {
 	return task_get_classid(skb);
@@ -2897,7 +3027,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_l3_csum_replace ||
 	    func == bpf_l4_csum_replace ||
 	    func == bpf_xdp_adjust_head ||
-	    func == bpf_xdp_adjust_meta)
+	    func == bpf_xdp_adjust_meta ||
+	    func == bpf_msg_pull_data)
 		return true;
 
 	return false;
@@ -3666,6 +3797,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
 		return &bpf_msg_apply_bytes_proto;
 	case BPF_FUNC_msg_cork_bytes:
 		return &bpf_msg_cork_bytes_proto;
+	case BPF_FUNC_msg_pull_data:
+		return &bpf_msg_pull_data_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 6aec208786c2a54cbf6135a0242b224e845bef98 Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Sun, 4 Mar 2018 15:29:51 -0800
Subject: netfilter: Refactor nf_conncount

Remove parameter 'family' in nf_conncount_count() and count_tree().
It is because the parameter is not useful after commit 625c556118f3
("netfilter: connlimit: split xt_connlimit into front and backend").

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conncount.c | 4 +---
 net/netfilter/xt_connlimit.c | 4 ++--
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 6d65389e308f..9305a08b4422 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -158,7 +158,6 @@ static void tree_nodes_free(struct rb_root *root,
 static unsigned int
 count_tree(struct net *net, struct rb_root *root,
 	   const u32 *key, u8 keylen,
-	   u8 family,
 	   const struct nf_conntrack_tuple *tuple,
 	   const struct nf_conntrack_zone *zone)
 {
@@ -246,7 +245,6 @@ count_tree(struct net *net, struct rb_root *root,
 unsigned int nf_conncount_count(struct net *net,
 				struct nf_conncount_data *data,
 				const u32 *key,
-				unsigned int family,
 				const struct nf_conntrack_tuple *tuple,
 				const struct nf_conntrack_zone *zone)
 {
@@ -259,7 +257,7 @@ unsigned int nf_conncount_count(struct net *net,
 
 	spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
 
-	count = count_tree(net, root, key, data->keylen, family, tuple, zone);
+	count = count_tree(net, root, key, data->keylen, tuple, zone);
 
 	spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
 
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index b1b17b9353e1..6275106ccf50 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -67,8 +67,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		key[1] = zone->id;
 	}
 
-	connections = nf_conncount_count(net, info->data, key,
-					 xt_family(par), tuple_ptr, zone);
+	connections = nf_conncount_count(net, info->data, key, tuple_ptr,
+					 zone);
 	if (connections == 0)
 		/* kmalloc failed, drop it entirely */
 		goto hotdrop;
-- 
cgit v1.2.3


From 35d8deb80c30fdb2dee3e2dac71eab00d8a6fed5 Mon Sep 17 00:00:00 2001
From: Yi-Hung Wei <yihung.wei@gmail.com>
Date: Sun, 4 Mar 2018 15:29:52 -0800
Subject: netfilter: conncount: Support count only use case

Currently, nf_conncount_count() counts the number of connections that
matches key and inserts a conntrack 'tuple' with the same key into the
accounting data structure.  This patch supports another use case that only
counts the number of connections where 'tuple' is not provided.  Therefore,
proper changes are made on nf_conncount_count() to support the case where
'tuple' is NULL.  This could be useful for querying statistics or
debugging purpose.

Signed-off-by: Yi-Hung Wei <yihung.wei@gmail.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conncount.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 9305a08b4422..153e690e2893 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -104,7 +104,7 @@ static unsigned int check_hlist(struct net *net,
 	struct nf_conn *found_ct;
 	unsigned int length = 0;
 
-	*addit = true;
+	*addit = tuple ? true : false;
 
 	/* check the saved connections */
 	hlist_for_each_entry_safe(conn, n, head, node) {
@@ -117,7 +117,7 @@ static unsigned int check_hlist(struct net *net,
 
 		found_ct = nf_ct_tuplehash_to_ctrack(found);
 
-		if (nf_ct_tuple_equal(&conn->tuple, tuple)) {
+		if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple)) {
 			/*
 			 * Just to be sure we have it only once in the list.
 			 * We should not see tuples twice unless someone hooks
@@ -220,6 +220,9 @@ count_tree(struct net *net, struct rb_root *root,
 		goto restart;
 	}
 
+	if (!tuple)
+		return 0;
+
 	/* no match, need to insert new node */
 	rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
 	if (rbconn == NULL)
@@ -242,6 +245,9 @@ count_tree(struct net *net, struct rb_root *root,
 	return 1;
 }
 
+/* Count and return number of conntrack entries in 'net' with particular 'key'.
+ * If 'tuple' is not null, insert it into the accounting data structure.
+ */
 unsigned int nf_conncount_count(struct net *net,
 				struct nf_conncount_data *data,
 				const u32 *key,
-- 
cgit v1.2.3


From d719e3f21cf91d3f82bd827d46199ba41af2f73a Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 9 Mar 2018 11:57:20 +0100
Subject: netfilter: nft_ct: add NFT_CT_{SRC,DST}_{IP,IP6}

All existing keys, except the NFT_CT_SRC and NFT_CT_DST are assumed to
have strict datatypes. This is causing problems with sets and
concatenations given the specific length of these keys is not known.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nft_ct.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 6ab274b14484..ea737fd789e8 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -196,6 +196,26 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
 	case NFT_CT_PROTO_DST:
 		nft_reg_store16(dest, (__force u16)tuple->dst.u.all);
 		return;
+	case NFT_CT_SRC_IP:
+		if (nf_ct_l3num(ct) != NFPROTO_IPV4)
+			goto err;
+		*dest = tuple->src.u3.ip;
+		return;
+	case NFT_CT_DST_IP:
+		if (nf_ct_l3num(ct) != NFPROTO_IPV4)
+			goto err;
+		*dest = tuple->dst.u3.ip;
+		return;
+	case NFT_CT_SRC_IP6:
+		if (nf_ct_l3num(ct) != NFPROTO_IPV6)
+			goto err;
+		memcpy(dest, tuple->src.u3.ip6, sizeof(struct in6_addr));
+		return;
+	case NFT_CT_DST_IP6:
+		if (nf_ct_l3num(ct) != NFPROTO_IPV6)
+			goto err;
+		memcpy(dest, tuple->dst.u3.ip6, sizeof(struct in6_addr));
+		return;
 	default:
 		break;
 	}
@@ -419,6 +439,20 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 			return -EAFNOSUPPORT;
 		}
 		break;
+	case NFT_CT_SRC_IP:
+	case NFT_CT_DST_IP:
+		if (tb[NFTA_CT_DIRECTION] == NULL)
+			return -EINVAL;
+
+		len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip);
+		break;
+	case NFT_CT_SRC_IP6:
+	case NFT_CT_DST_IP6:
+		if (tb[NFTA_CT_DIRECTION] == NULL)
+			return -EINVAL;
+
+		len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u3.ip6);
+		break;
 	case NFT_CT_PROTO_SRC:
 	case NFT_CT_PROTO_DST:
 		if (tb[NFTA_CT_DIRECTION] == NULL)
@@ -588,6 +622,10 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
 	switch (priv->key) {
 	case NFT_CT_SRC:
 	case NFT_CT_DST:
+	case NFT_CT_SRC_IP:
+	case NFT_CT_DST_IP:
+	case NFT_CT_SRC_IP6:
+	case NFT_CT_DST_IP6:
 	case NFT_CT_PROTO_SRC:
 	case NFT_CT_PROTO_DST:
 		if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
-- 
cgit v1.2.3


From 8039ab43eeac029a9c47c0411918ea82c9ce87cd Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Mon, 12 Mar 2018 18:14:42 -0500
Subject: netfilter: cttimeout: remove VLA usage

In preparation to enabling -Wvla, remove VLA and replace it
with dynamic memory allocation.

>From a security viewpoint, the use of Variable Length Arrays can be
a vector for stack overflow attacks. Also, in general, as the code
evolves it is easy to lose track of how big a VLA can get. Thus, we
can end up having segfaults that are hard to debug.

Also, fixed as part of the directive to remove all VLAs from
the kernel: https://lkml.org/lkml/2018/3/7/621

While at it, remove likely() notation which is not necessary from the
control plane code.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_cttimeout.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 95b04702a655..9ee5fa551fa6 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -51,19 +51,27 @@ ctnl_timeout_parse_policy(void *timeouts,
 			  const struct nf_conntrack_l4proto *l4proto,
 			  struct net *net, const struct nlattr *attr)
 {
+	struct nlattr **tb;
 	int ret = 0;
 
-	if (likely(l4proto->ctnl_timeout.nlattr_to_obj)) {
-		struct nlattr *tb[l4proto->ctnl_timeout.nlattr_max+1];
+	if (!l4proto->ctnl_timeout.nlattr_to_obj)
+		return 0;
 
-		ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max,
-				       attr, l4proto->ctnl_timeout.nla_policy,
-				       NULL);
-		if (ret < 0)
-			return ret;
+	tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb),
+		     GFP_KERNEL);
 
-		ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts);
-	}
+	if (!tb)
+		return -ENOMEM;
+
+	ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max, attr,
+			       l4proto->ctnl_timeout.nla_policy, NULL);
+	if (ret < 0)
+		goto err;
+
+	ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts);
+
+err:
+	kfree(tb);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 1446385904add0e89f990ee0518434365e50ce86 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Mon, 12 Mar 2018 19:21:38 -0500
Subject: netfilter: nfnetlink_cthelper: Remove VLA usage

In preparation to enabling -Wvla, remove VLA and replace it
with dynamic memory allocation.

>From a security viewpoint, the use of Variable Length Arrays can be
a vector for stack overflow attacks. Also, in general, as the code
evolves it is easy to lose track of how big a VLA can get. Thus, we
can end up having segfaults that are hard to debug.

Also, fixed as part of the directive to remove all VLAs from
the kernel: https://lkml.org/lkml/2018/3/7/621

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_cthelper.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index d33ce6d5ebce..4a4b293fb2e5 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -314,23 +314,30 @@ nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy,
 static int nfnl_cthelper_update_policy_all(struct nlattr *tb[],
 					   struct nf_conntrack_helper *helper)
 {
-	struct nf_conntrack_expect_policy new_policy[helper->expect_class_max + 1];
+	struct nf_conntrack_expect_policy *new_policy;
 	struct nf_conntrack_expect_policy *policy;
-	int i, err;
+	int i, ret = 0;
+
+	new_policy = kmalloc_array(helper->expect_class_max + 1,
+				   sizeof(*new_policy), GFP_KERNEL);
+	if (!new_policy)
+		return -ENOMEM;
 
 	/* Check first that all policy attributes are well-formed, so we don't
 	 * leave things in inconsistent state on errors.
 	 */
 	for (i = 0; i < helper->expect_class_max + 1; i++) {
 
-		if (!tb[NFCTH_POLICY_SET + i])
-			return -EINVAL;
+		if (!tb[NFCTH_POLICY_SET + i]) {
+			ret = -EINVAL;
+			goto err;
+		}
 
-		err = nfnl_cthelper_update_policy_one(&helper->expect_policy[i],
+		ret = nfnl_cthelper_update_policy_one(&helper->expect_policy[i],
 						      &new_policy[i],
 						      tb[NFCTH_POLICY_SET + i]);
-		if (err < 0)
-			return err;
+		if (ret < 0)
+			goto err;
 	}
 	/* Now we can safely update them. */
 	for (i = 0; i < helper->expect_class_max + 1; i++) {
@@ -340,7 +347,9 @@ static int nfnl_cthelper_update_policy_all(struct nlattr *tb[],
 		policy->timeout	= new_policy->timeout;
 	}
 
-	return 0;
+err:
+	kfree(new_policy);
+	return ret;
 }
 
 static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper,
-- 
cgit v1.2.3


From 5b4c6e3860daaf089c28e0161dffef7b5ad8000f Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Mon, 12 Mar 2018 22:16:17 -0500
Subject: netfilter: nf_tables: remove VLA usage

In preparation to enabling -Wvla, remove VLA and replace it
with dynamic memory allocation.

>From a security viewpoint, the use of Variable Length Arrays can be
a vector for stack overflow attacks. Also, in general, as the code
evolves it is easy to lose track of how big a VLA can get. Thus, we
can end up having segfaults that are hard to debug.

Also, fixed as part of the directive to remove all VLAs from
the kernel: https://lkml.org/lkml/2018/3/7/621

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8cc7fc970f0c..92f5606b0dea 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4357,16 +4357,20 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
 				       const struct nft_object_type *type,
 				       const struct nlattr *attr)
 {
-	struct nlattr *tb[type->maxattr + 1];
+	struct nlattr **tb;
 	const struct nft_object_ops *ops;
 	struct nft_object *obj;
-	int err;
+	int err = -ENOMEM;
+
+	tb = kmalloc_array(type->maxattr + 1, sizeof(*tb), GFP_KERNEL);
+	if (!tb)
+		goto err1;
 
 	if (attr) {
 		err = nla_parse_nested(tb, type->maxattr, attr, type->policy,
 				       NULL);
 		if (err < 0)
-			goto err1;
+			goto err2;
 	} else {
 		memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1));
 	}
@@ -4375,7 +4379,7 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
 		ops = type->select_ops(ctx, (const struct nlattr * const *)tb);
 		if (IS_ERR(ops)) {
 			err = PTR_ERR(ops);
-			goto err1;
+			goto err2;
 		}
 	} else {
 		ops = type->ops;
@@ -4383,18 +4387,21 @@ static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
 
 	err = -ENOMEM;
 	obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL);
-	if (obj == NULL)
-		goto err1;
+	if (!obj)
+		goto err2;
 
 	err = ops->init(ctx, (const struct nlattr * const *)tb, obj);
 	if (err < 0)
-		goto err2;
+		goto err3;
 
 	obj->ops = ops;
 
+	kfree(tb);
 	return obj;
-err2:
+err3:
 	kfree(obj);
+err2:
+	kfree(tb);
 err1:
 	return ERR_PTR(err);
 }
-- 
cgit v1.2.3


From d72133e6288030121e425b89584ab3dfb68871cc Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Wed, 14 Mar 2018 23:36:53 +0900
Subject: netfilter: ebtables: use ADD_COUNTER macro

xtables uses ADD_COUNTER macro to increase
packet and byte count. ebtables also can use this.

Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebtables.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 217aa79f7b2a..9a26d2b7420f 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -223,9 +223,7 @@ unsigned int ebt_do_table(struct sk_buff *skb,
 			return NF_DROP;
 		}
 
-		/* increase counter */
-		(*(counter_base + i)).pcnt++;
-		(*(counter_base + i)).bcnt += skb->len;
+		ADD_COUNTER(*(counter_base + i), 1, skb->len);
 
 		/* these should only watch: not modify, nor tell us
 		 * what to do with the packet
@@ -968,10 +966,9 @@ static void get_counters(const struct ebt_counter *oldcounters,
 		if (cpu == 0)
 			continue;
 		counter_base = COUNTER_BASE(oldcounters, nentries, cpu);
-		for (i = 0; i < nentries; i++) {
-			counters[i].pcnt += counter_base[i].pcnt;
-			counters[i].bcnt += counter_base[i].bcnt;
-		}
+		for (i = 0; i < nentries; i++)
+			ADD_COUNTER(counters[i], counter_base[i].pcnt,
+				    counter_base[i].bcnt);
 	}
 }
 
@@ -1324,10 +1321,8 @@ static int do_update_counters(struct net *net, const char *name,
 	write_lock_bh(&t->lock);
 
 	/* we add to the counters of the first cpu */
-	for (i = 0; i < num_counters; i++) {
-		t->private->counters[i].pcnt += tmp[i].pcnt;
-		t->private->counters[i].bcnt += tmp[i].bcnt;
-	}
+	for (i = 0; i < num_counters; i++)
+		ADD_COUNTER(t->private->counters[i], tmp[i].pcnt, tmp[i].bcnt);
 
 	write_unlock_bh(&t->lock);
 	ret = 0;
-- 
cgit v1.2.3


From 472a73e00757b971d613d796374d2727b2e4954d Mon Sep 17 00:00:00 2001
From: Jack Ma <jack.ma@alliedtelesis.co.nz>
Date: Mon, 19 Mar 2018 09:41:59 +1300
Subject: netfilter: xt_conntrack: Support bit-shifting for CONNMARK & MARK
 targets.

This patch introduces a new feature that allows bitshifting (left
and right) operations to co-operate with existing iptables options.

Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Jack Ma <jack.ma@alliedtelesis.co.nz>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_connmark.c | 77 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 60 insertions(+), 17 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c
index 809639ce6f5a..773da82190dc 100644
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -36,9 +36,10 @@ MODULE_ALIAS("ipt_connmark");
 MODULE_ALIAS("ip6t_connmark");
 
 static unsigned int
-connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+connmark_tg_shift(struct sk_buff *skb,
+		const struct xt_connmark_tginfo1 *info,
+		u8 shift_bits, u8 shift_dir)
 {
-	const struct xt_connmark_tginfo1 *info = par->targinfo;
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct;
 	u_int32_t newmark;
@@ -50,6 +51,10 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	switch (info->mode) {
 	case XT_CONNMARK_SET:
 		newmark = (ct->mark & ~info->ctmask) ^ info->ctmark;
+		if (shift_dir == D_SHIFT_RIGHT)
+			newmark >>= shift_bits;
+		else
+			newmark <<= shift_bits;
 		if (ct->mark != newmark) {
 			ct->mark = newmark;
 			nf_conntrack_event_cache(IPCT_MARK, ct);
@@ -57,7 +62,11 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		break;
 	case XT_CONNMARK_SAVE:
 		newmark = (ct->mark & ~info->ctmask) ^
-		          (skb->mark & info->nfmask);
+			  (skb->mark & info->nfmask);
+		if (shift_dir == D_SHIFT_RIGHT)
+			newmark >>= shift_bits;
+		else
+			newmark <<= shift_bits;
 		if (ct->mark != newmark) {
 			ct->mark = newmark;
 			nf_conntrack_event_cache(IPCT_MARK, ct);
@@ -65,14 +74,34 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 		break;
 	case XT_CONNMARK_RESTORE:
 		newmark = (skb->mark & ~info->nfmask) ^
-		          (ct->mark & info->ctmask);
+			  (ct->mark & info->ctmask);
+		if (shift_dir == D_SHIFT_RIGHT)
+			newmark >>= shift_bits;
+		else
+			newmark <<= shift_bits;
 		skb->mark = newmark;
 		break;
 	}
-
 	return XT_CONTINUE;
 }
 
+static unsigned int
+connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_connmark_tginfo1 *info = par->targinfo;
+
+	return connmark_tg_shift(skb, info, 0, 0);
+}
+
+static unsigned int
+connmark_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_connmark_tginfo2 *info = par->targinfo;
+
+	return connmark_tg_shift(skb, (const struct xt_connmark_tginfo1 *)info,
+				 info->shift_bits, info->shift_dir);
+}
+
 static int connmark_tg_check(const struct xt_tgchk_param *par)
 {
 	int ret;
@@ -119,15 +148,27 @@ static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
 	nf_ct_netns_put(par->net, par->family);
 }
 
-static struct xt_target connmark_tg_reg __read_mostly = {
-	.name           = "CONNMARK",
-	.revision       = 1,
-	.family         = NFPROTO_UNSPEC,
-	.checkentry     = connmark_tg_check,
-	.target         = connmark_tg,
-	.targetsize     = sizeof(struct xt_connmark_tginfo1),
-	.destroy        = connmark_tg_destroy,
-	.me             = THIS_MODULE,
+static struct xt_target connmark_tg_reg[] __read_mostly = {
+	{
+		.name           = "CONNMARK",
+		.revision       = 1,
+		.family         = NFPROTO_UNSPEC,
+		.checkentry     = connmark_tg_check,
+		.target         = connmark_tg,
+		.targetsize     = sizeof(struct xt_connmark_tginfo1),
+		.destroy        = connmark_tg_destroy,
+		.me             = THIS_MODULE,
+	},
+	{
+		.name           = "CONNMARK",
+		.revision       = 2,
+		.family         = NFPROTO_UNSPEC,
+		.checkentry     = connmark_tg_check,
+		.target         = connmark_tg_v2,
+		.targetsize     = sizeof(struct xt_connmark_tginfo2),
+		.destroy        = connmark_tg_destroy,
+		.me             = THIS_MODULE,
+	}
 };
 
 static struct xt_match connmark_mt_reg __read_mostly = {
@@ -145,12 +186,14 @@ static int __init connmark_mt_init(void)
 {
 	int ret;
 
-	ret = xt_register_target(&connmark_tg_reg);
+	ret = xt_register_targets(connmark_tg_reg,
+				  ARRAY_SIZE(connmark_tg_reg));
 	if (ret < 0)
 		return ret;
 	ret = xt_register_match(&connmark_mt_reg);
 	if (ret < 0) {
-		xt_unregister_target(&connmark_tg_reg);
+		xt_unregister_targets(connmark_tg_reg,
+				      ARRAY_SIZE(connmark_tg_reg));
 		return ret;
 	}
 	return 0;
@@ -159,7 +202,7 @@ static int __init connmark_mt_init(void)
 static void __exit connmark_mt_exit(void)
 {
 	xt_unregister_match(&connmark_mt_reg);
-	xt_unregister_target(&connmark_tg_reg);
+	xt_unregister_target(connmark_tg_reg);
 }
 
 module_init(connmark_mt_init);
-- 
cgit v1.2.3


From 5191d70f83fd1878c40029cffe69f6a2bf65fa0e Mon Sep 17 00:00:00 2001
From: Arushi Singhal <arushisinghal19971997@gmail.com>
Date: Mon, 12 Mar 2018 18:36:29 +0530
Subject: netfilter: Replace printk() with pr_*() and define pr_fmt()

Using pr_<loglevel>() is more concise than printk(KERN_<LOGLEVEL>).
This patch:
* Replace printks having a log level with the appropriate
pr_*() macros.
* Define pr_fmt() to include relevant name.
* Remove redundant prefixes from pr_*() calls.
* Indent the code where possible.
* Remove the useless output messages.
* Remove periods from messages.

Signed-off-by: Arushi Singhal <arushisinghal19971997@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_acct.c      |  6 ++++--
 net/netfilter/nf_conntrack_ecache.c    |  6 ++++--
 net/netfilter/nf_conntrack_timestamp.c |  6 ++++--
 net/netfilter/nf_nat_core.c            |  4 +++-
 net/netfilter/nf_nat_ftp.c             |  7 ++++---
 net/netfilter/nf_nat_irc.c             |  7 ++++---
 net/netfilter/nfnetlink_queue.c        | 14 +++++++-------
 net/netfilter/xt_time.c                | 13 +++++++------
 8 files changed, 37 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c
index 866916712905..1d66de5151b2 100644
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -8,6 +8,8 @@
  * published by the Free Software Foundation.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/netfilter.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
@@ -80,7 +82,7 @@ static int nf_conntrack_acct_init_sysctl(struct net *net)
 	net->ct.acct_sysctl_header = register_net_sysctl(net, "net/netfilter",
 							 table);
 	if (!net->ct.acct_sysctl_header) {
-		printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n");
+		pr_err("can't register to sysctl\n");
 		goto out_register;
 	}
 	return 0;
@@ -125,7 +127,7 @@ int nf_conntrack_acct_init(void)
 {
 	int ret = nf_ct_extend_register(&acct_extend);
 	if (ret < 0)
-		pr_err("nf_conntrack_acct: Unable to register extension\n");
+		pr_err("Unable to register extension\n");
 	return ret;
 }
 
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index caac41ad9483..c11822a7d2bf 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -11,6 +11,8 @@
  * published by the Free Software Foundation.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/types.h>
 #include <linux/netfilter.h>
 #include <linux/skbuff.h>
@@ -372,7 +374,7 @@ static int nf_conntrack_event_init_sysctl(struct net *net)
 	net->ct.event_sysctl_header =
 		register_net_sysctl(net, "net/netfilter", table);
 	if (!net->ct.event_sysctl_header) {
-		printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n");
+		pr_err("can't register to sysctl\n");
 		goto out_register;
 	}
 	return 0;
@@ -419,7 +421,7 @@ int nf_conntrack_ecache_init(void)
 {
 	int ret = nf_ct_extend_register(&event_extend);
 	if (ret < 0)
-		pr_err("nf_ct_event: Unable to register event extension.\n");
+		pr_err("Unable to register event extension\n");
 
 	BUILD_BUG_ON(__IPCT_MAX >= 16);	/* ctmask, missed use u16 */
 
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
index 4c4734b78318..56766cb26e40 100644
--- a/net/netfilter/nf_conntrack_timestamp.c
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -6,6 +6,8 @@
  * published by the Free Software Foundation (or any later at your option).
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/netfilter.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
@@ -58,7 +60,7 @@ static int nf_conntrack_tstamp_init_sysctl(struct net *net)
 	net->ct.tstamp_sysctl_header = register_net_sysctl(net,	"net/netfilter",
 							   table);
 	if (!net->ct.tstamp_sysctl_header) {
-		printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n");
+		pr_err("can't register to sysctl\n");
 		goto out_register;
 	}
 	return 0;
@@ -104,7 +106,7 @@ int nf_conntrack_tstamp_init(void)
 	int ret;
 	ret = nf_ct_extend_register(&tstamp_extend);
 	if (ret < 0)
-		pr_err("nf_ct_tstamp: Unable to register extension\n");
+		pr_err("Unable to register extension\n");
 	return ret;
 }
 
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 6c38421e31f9..617693ff9f4c 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -8,6 +8,8 @@
  * published by the Free Software Foundation.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/timer.h>
@@ -814,7 +816,7 @@ static int __init nf_nat_init(void)
 	ret = nf_ct_extend_register(&nat_extend);
 	if (ret < 0) {
 		nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
-		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
+		pr_err("Unable to register extension\n");
 		return ret;
 	}
 
diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c
index d76afafdc699..5063cbf1689c 100644
--- a/net/netfilter/nf_nat_ftp.c
+++ b/net/netfilter/nf_nat_ftp.c
@@ -8,6 +8,8 @@
  * published by the Free Software Foundation.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/inet.h>
@@ -71,7 +73,7 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
 	char buffer[sizeof("|1||65535|") + INET6_ADDRSTRLEN];
 	unsigned int buflen;
 
-	pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
+	pr_debug("type %i, off %u len %u\n", type, matchoff, matchlen);
 
 	/* Connection will come from wherever this packet goes, hence !dir */
 	newaddr = ct->tuplehash[!dir].tuple.dst.u3;
@@ -136,8 +138,7 @@ static int __init nf_nat_ftp_init(void)
 /* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. */
 static int warn_set(const char *val, const struct kernel_param *kp)
 {
-	printk(KERN_INFO KBUILD_MODNAME
-	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+	pr_info("kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
 	return 0;
 }
 module_param_call(ports, warn_set, NULL, NULL, 0);
diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c
index dcb5f6375d9d..3aa35a43100d 100644
--- a/net/netfilter/nf_nat_irc.c
+++ b/net/netfilter/nf_nat_irc.c
@@ -10,6 +10,8 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/tcp.h>
@@ -79,7 +81,7 @@ static unsigned int help(struct sk_buff *skb,
 	 */
 	/* AAA = "us", ie. where server normally talks to. */
 	snprintf(buffer, sizeof(buffer), "%u %u", ntohl(newaddr.ip), port);
-	pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n",
+	pr_debug("inserting '%s' == %pI4, port %u\n",
 		 buffer, &newaddr.ip, port);
 
 	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff,
@@ -108,8 +110,7 @@ static int __init nf_nat_irc_init(void)
 /* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. */
 static int warn_set(const char *val, const struct kernel_param *kp)
 {
-	printk(KERN_INFO KBUILD_MODNAME
-	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+	pr_info("kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
 	return 0;
 }
 module_param_call(ports, warn_set, NULL, NULL, 0);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 8bba23160a68..74a04638ef03 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -14,6 +14,9 @@
  * published by the Free Software Foundation.
  *
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/skbuff.h>
 #include <linux/init.h>
@@ -833,11 +836,8 @@ nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
 		if (diff > skb_tailroom(e->skb)) {
 			nskb = skb_copy_expand(e->skb, skb_headroom(e->skb),
 					       diff, GFP_ATOMIC);
-			if (!nskb) {
-				printk(KERN_WARNING "nf_queue: OOM "
-				      "in mangle, dropping packet\n");
+			if (!nskb)
 				return -ENOMEM;
-			}
 			kfree_skb(e->skb);
 			e->skb = nskb;
 		}
@@ -1536,20 +1536,20 @@ static int __init nfnetlink_queue_init(void)
 
 	status = register_pernet_subsys(&nfnl_queue_net_ops);
 	if (status < 0) {
-		pr_err("nf_queue: failed to register pernet ops\n");
+		pr_err("failed to register pernet ops\n");
 		goto out;
 	}
 
 	netlink_register_notifier(&nfqnl_rtnl_notifier);
 	status = nfnetlink_subsys_register(&nfqnl_subsys);
 	if (status < 0) {
-		pr_err("nf_queue: failed to create netlink socket\n");
+		pr_err("failed to create netlink socket\n");
 		goto cleanup_netlink_notifier;
 	}
 
 	status = register_netdevice_notifier(&nfqnl_dev_notifier);
 	if (status < 0) {
-		pr_err("nf_queue: failed to register netdevice notifier\n");
+		pr_err("failed to register netdevice notifier\n");
 		goto cleanup_netlink_subsys;
 	}
 
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index 0160f505e337..c13bcd0ab491 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -9,6 +9,9 @@
  *	This file is distributed under the terms of the GNU General Public
  *	License (GPL). Copies of the GPL can be obtained from gnu.org/gpl.
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/ktime.h>
 #include <linux/module.h>
 #include <linux/skbuff.h>
@@ -266,13 +269,11 @@ static int __init time_mt_init(void)
 	int minutes = sys_tz.tz_minuteswest;
 
 	if (minutes < 0) /* east of Greenwich */
-		printk(KERN_INFO KBUILD_MODNAME
-		       ": kernel timezone is +%02d%02d\n",
-		       -minutes / 60, -minutes % 60);
+		pr_info("kernel timezone is +%02d%02d\n",
+			-minutes / 60, -minutes % 60);
 	else /* west of Greenwich */
-		printk(KERN_INFO KBUILD_MODNAME
-		       ": kernel timezone is -%02d%02d\n",
-		       minutes / 60, minutes % 60);
+		pr_info("kernel timezone is -%02d%02d\n",
+			minutes / 60, minutes % 60);
 
 	return xt_register_match(&xt_time_mt_reg);
 }
-- 
cgit v1.2.3


From 20710b3b81895c89e92bcc32ce85c0bede1171f8 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 20 Mar 2018 12:33:51 +0100
Subject: netfilter: ctnetlink: synproxy support

This patch exposes synproxy information per-conntrack. Moreover, send
sequence adjustment events once server sends us the SYN,ACK packet, so
we can synchronize the sequence adjustment too for packets going as
reply from the server, as part of the synproxy logic.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/ipt_SYNPROXY.c    |  8 +++-
 net/ipv6/netfilter/ip6t_SYNPROXY.c   |  8 +++-
 net/netfilter/nf_conntrack_netlink.c | 87 ++++++++++++++++++++++++++++++++++--
 3 files changed, 98 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index f75fc6b53115..690b17ef6a44 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -16,6 +16,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
 
 static struct iphdr *
 synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
@@ -384,6 +385,8 @@ static unsigned int ipv4_synproxy_hook(void *priv,
 		synproxy->isn = ntohl(th->ack_seq);
 		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
 			synproxy->its = opts.tsecr;
+
+		nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
 		break;
 	case TCP_CONNTRACK_SYN_RECV:
 		if (!th->syn || !th->ack)
@@ -392,8 +395,10 @@ static unsigned int ipv4_synproxy_hook(void *priv,
 		if (!synproxy_parse_options(skb, thoff, th, &opts))
 			return NF_DROP;
 
-		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) {
 			synproxy->tsoff = opts.tsval - synproxy->its;
+			nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+		}
 
 		opts.options &= ~(XT_SYNPROXY_OPT_MSS |
 				  XT_SYNPROXY_OPT_WSCALE |
@@ -403,6 +408,7 @@ static unsigned int ipv4_synproxy_hook(void *priv,
 		synproxy_send_server_ack(net, state, skb, th, &opts);
 
 		nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+		nf_conntrack_event_cache(IPCT_SEQADJ, ct);
 
 		swap(opts.tsval, opts.tsecr);
 		synproxy_send_client_ack(net, skb, th, &opts);
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 437af8c95277..cb6d42b03cb5 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -18,6 +18,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
 
 static struct ipv6hdr *
 synproxy_build_ip(struct net *net, struct sk_buff *skb,
@@ -405,6 +406,8 @@ static unsigned int ipv6_synproxy_hook(void *priv,
 		synproxy->isn = ntohl(th->ack_seq);
 		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
 			synproxy->its = opts.tsecr;
+
+		nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
 		break;
 	case TCP_CONNTRACK_SYN_RECV:
 		if (!th->syn || !th->ack)
@@ -413,8 +416,10 @@ static unsigned int ipv6_synproxy_hook(void *priv,
 		if (!synproxy_parse_options(skb, thoff, th, &opts))
 			return NF_DROP;
 
-		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
+		if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) {
 			synproxy->tsoff = opts.tsval - synproxy->its;
+			nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
+		}
 
 		opts.options &= ~(XT_SYNPROXY_OPT_MSS |
 				  XT_SYNPROXY_OPT_WSCALE |
@@ -424,6 +429,7 @@ static unsigned int ipv6_synproxy_hook(void *priv,
 		synproxy_send_server_ack(net, state, skb, th, &opts);
 
 		nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
+		nf_conntrack_event_cache(IPCT_SEQADJ, ct);
 
 		swap(opts.tsval, opts.tsecr);
 		synproxy_send_client_ack(net, skb, th, &opts);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 8884d302d33a..11ef85a57244 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -440,6 +440,31 @@ err:
 	return -1;
 }
 
+static int ctnetlink_dump_ct_synproxy(struct sk_buff *skb, struct nf_conn *ct)
+{
+	struct nf_conn_synproxy *synproxy = nfct_synproxy(ct);
+	struct nlattr *nest_parms;
+
+	if (!synproxy)
+		return 0;
+
+	nest_parms = nla_nest_start(skb, CTA_SYNPROXY | NLA_F_NESTED);
+	if (!nest_parms)
+		goto nla_put_failure;
+
+	if (nla_put_be32(skb, CTA_SYNPROXY_ISN, htonl(synproxy->isn)) ||
+	    nla_put_be32(skb, CTA_SYNPROXY_ITS, htonl(synproxy->its)) ||
+	    nla_put_be32(skb, CTA_SYNPROXY_TSOFF, htonl(synproxy->tsoff)))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest_parms);
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+
 static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
 {
 	if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct)))
@@ -518,7 +543,8 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 	    ctnetlink_dump_id(skb, ct) < 0 ||
 	    ctnetlink_dump_use(skb, ct) < 0 ||
 	    ctnetlink_dump_master(skb, ct) < 0 ||
-	    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
+	    ctnetlink_dump_ct_seq_adj(skb, ct) < 0 ||
+	    ctnetlink_dump_ct_synproxy(skb, ct) < 0)
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
@@ -730,6 +756,10 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
 		if (events & (1 << IPCT_SEQADJ) &&
 		    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
 			goto nla_put_failure;
+
+		if (events & (1 << IPCT_SYNPROXY) &&
+		    ctnetlink_dump_ct_synproxy(skb, ct) < 0)
+			goto nla_put_failure;
 	}
 
 #ifdef CONFIG_NF_CONNTRACK_MARK
@@ -1689,6 +1719,39 @@ err:
 	return ret;
 }
 
+static const struct nla_policy synproxy_policy[CTA_SYNPROXY_MAX + 1] = {
+	[CTA_SYNPROXY_ISN]	= { .type = NLA_U32 },
+	[CTA_SYNPROXY_ITS]	= { .type = NLA_U32 },
+	[CTA_SYNPROXY_TSOFF]	= { .type = NLA_U32 },
+};
+
+static int ctnetlink_change_synproxy(struct nf_conn *ct,
+				     const struct nlattr * const cda[])
+{
+	struct nf_conn_synproxy *synproxy = nfct_synproxy(ct);
+	struct nlattr *tb[CTA_SYNPROXY_MAX + 1];
+	int err;
+
+	if (!synproxy)
+		return 0;
+
+	err = nla_parse_nested(tb, CTA_SYNPROXY_MAX, cda[CTA_SYNPROXY],
+			       synproxy_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (!tb[CTA_SYNPROXY_ISN] ||
+	    !tb[CTA_SYNPROXY_ITS] ||
+	    !tb[CTA_SYNPROXY_TSOFF])
+		return -EINVAL;
+
+	synproxy->isn = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ISN]));
+	synproxy->its = ntohl(nla_get_be32(tb[CTA_SYNPROXY_ITS]));
+	synproxy->tsoff = ntohl(nla_get_be32(tb[CTA_SYNPROXY_TSOFF]));
+
+	return 0;
+}
+
 static int
 ctnetlink_attach_labels(struct nf_conn *ct, const struct nlattr * const cda[])
 {
@@ -1759,6 +1822,12 @@ ctnetlink_change_conntrack(struct nf_conn *ct,
 			return err;
 	}
 
+	if (cda[CTA_SYNPROXY]) {
+		err = ctnetlink_change_synproxy(ct, cda);
+		if (err < 0)
+			return err;
+	}
+
 	if (cda[CTA_LABELS]) {
 		err = ctnetlink_attach_labels(ct, cda);
 		if (err < 0)
@@ -1880,6 +1949,12 @@ ctnetlink_create_conntrack(struct net *net,
 			goto err2;
 	}
 
+	if (cda[CTA_SYNPROXY]) {
+		err = ctnetlink_change_synproxy(ct, cda);
+		if (err < 0)
+			goto err2;
+	}
+
 #if defined(CONFIG_NF_CONNTRACK_MARK)
 	if (cda[CTA_MARK])
 		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
@@ -1991,7 +2066,9 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
 						      (1 << IPCT_HELPER) |
 						      (1 << IPCT_PROTOINFO) |
 						      (1 << IPCT_SEQADJ) |
-						      (1 << IPCT_MARK) | events,
+						      (1 << IPCT_MARK) |
+						      (1 << IPCT_SYNPROXY) |
+						      events,
 						      ct, NETLINK_CB(skb).portid,
 						      nlmsg_report(nlh));
 			nf_ct_put(ct);
@@ -2012,7 +2089,8 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
 						      (1 << IPCT_LABEL) |
 						      (1 << IPCT_PROTOINFO) |
 						      (1 << IPCT_SEQADJ) |
-						      (1 << IPCT_MARK),
+						      (1 << IPCT_MARK) |
+						      (1 << IPCT_SYNPROXY),
 						      ct, NETLINK_CB(skb).portid,
 						      nlmsg_report(nlh));
 		}
@@ -2282,6 +2360,9 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct)
 	    ctnetlink_dump_ct_seq_adj(skb, ct) < 0)
 		goto nla_put_failure;
 
+	if (ctnetlink_dump_ct_synproxy(skb, ct) < 0)
+		goto nla_put_failure;
+
 #ifdef CONFIG_NF_CONNTRACK_MARK
 	if (ct->mark && ctnetlink_dump_mark(skb, ct) < 0)
 		goto nla_put_failure;
-- 
cgit v1.2.3


From 5adc1668ddc42bb44fd6d006cacad74ed0cbf49d Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Sun, 4 Mar 2018 09:28:53 +0100
Subject: netfilter: ebtables: add support for matching ICMP type and code

We already have ICMPv6 type/code matches. This adds support for IPv4 ICMP
matches in the same way.

Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebt_ip.c | 43 +++++++++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index 2b46c50abce0..8cb8f8395768 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -19,9 +19,15 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_ip.h>
 
-struct tcpudphdr {
-	__be16 src;
-	__be16 dst;
+union pkthdr {
+	struct {
+		__be16 src;
+		__be16 dst;
+	} tcpudphdr;
+	struct {
+		u8 type;
+		u8 code;
+	} icmphdr;
 };
 
 static bool
@@ -30,8 +36,8 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	const struct ebt_ip_info *info = par->matchinfo;
 	const struct iphdr *ih;
 	struct iphdr _iph;
-	const struct tcpudphdr *pptr;
-	struct tcpudphdr _ports;
+	const union pkthdr *pptr;
+	union pkthdr _pkthdr;
 
 	ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
 	if (ih == NULL)
@@ -50,29 +56,38 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	if (info->bitmask & EBT_IP_PROTO) {
 		if (NF_INVF(info, EBT_IP_PROTO, info->protocol != ih->protocol))
 			return false;
-		if (!(info->bitmask & EBT_IP_DPORT) &&
-		    !(info->bitmask & EBT_IP_SPORT))
+		if (!(info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT |
+				       EBT_IP_ICMP)))
 			return true;
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			return false;
+
+		/* min icmp headersize is 4, so sizeof(_pkthdr) is ok. */
 		pptr = skb_header_pointer(skb, ih->ihl*4,
-					  sizeof(_ports), &_ports);
+					  sizeof(_pkthdr), &_pkthdr);
 		if (pptr == NULL)
 			return false;
 		if (info->bitmask & EBT_IP_DPORT) {
-			u32 dst = ntohs(pptr->dst);
+			u32 dst = ntohs(pptr->tcpudphdr.dst);
 			if (NF_INVF(info, EBT_IP_DPORT,
 				    dst < info->dport[0] ||
 				    dst > info->dport[1]))
 				return false;
 		}
 		if (info->bitmask & EBT_IP_SPORT) {
-			u32 src = ntohs(pptr->src);
+			u32 src = ntohs(pptr->tcpudphdr.src);
 			if (NF_INVF(info, EBT_IP_SPORT,
 				    src < info->sport[0] ||
 				    src > info->sport[1]))
 				return false;
 		}
+		if ((info->bitmask & EBT_IP_ICMP) &&
+		    NF_INVF(info, EBT_IP_ICMP,
+			    pptr->icmphdr.type < info->icmp_type[0] ||
+			    pptr->icmphdr.type > info->icmp_type[1] ||
+			    pptr->icmphdr.code < info->icmp_code[0] ||
+			    pptr->icmphdr.code > info->icmp_code[1]))
+			return false;
 	}
 	return true;
 }
@@ -101,6 +116,14 @@ static int ebt_ip_mt_check(const struct xt_mtchk_param *par)
 		return -EINVAL;
 	if (info->bitmask & EBT_IP_SPORT && info->sport[0] > info->sport[1])
 		return -EINVAL;
+	if (info->bitmask & EBT_IP_ICMP) {
+		if ((info->invflags & EBT_IP_PROTO) ||
+		    info->protocol != IPPROTO_ICMP)
+			return -EINVAL;
+		if (info->icmp_type[0] > info->icmp_type[1] ||
+		    info->icmp_code[0] > info->icmp_code[1])
+			return -EINVAL;
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From 78d9f4d49bbecd101b4e5faf19f8f70719fee2ca Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Sun, 4 Mar 2018 09:28:54 +0100
Subject: netfilter: ebtables: add support for matching IGMP type

We already have ICMPv6 type/code matches (which can be used to distinguish
different types of MLD packets). Add support for IPv4 IGMP matches in the
same way.

Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebt_ip.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index 8cb8f8395768..ffaa8ce2e724 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -28,6 +28,9 @@ union pkthdr {
 		u8 type;
 		u8 code;
 	} icmphdr;
+	struct {
+		u8 type;
+	} igmphdr;
 };
 
 static bool
@@ -57,12 +60,12 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par)
 		if (NF_INVF(info, EBT_IP_PROTO, info->protocol != ih->protocol))
 			return false;
 		if (!(info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT |
-				       EBT_IP_ICMP)))
+				       EBT_IP_ICMP | EBT_IP_IGMP)))
 			return true;
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			return false;
 
-		/* min icmp headersize is 4, so sizeof(_pkthdr) is ok. */
+		/* min icmp/igmp headersize is 4, so sizeof(_pkthdr) is ok. */
 		pptr = skb_header_pointer(skb, ih->ihl*4,
 					  sizeof(_pkthdr), &_pkthdr);
 		if (pptr == NULL)
@@ -88,6 +91,11 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param *par)
 			    pptr->icmphdr.code < info->icmp_code[0] ||
 			    pptr->icmphdr.code > info->icmp_code[1]))
 			return false;
+		if ((info->bitmask & EBT_IP_IGMP) &&
+		    NF_INVF(info, EBT_IP_IGMP,
+			    pptr->igmphdr.type < info->igmp_type[0] ||
+			    pptr->igmphdr.type > info->igmp_type[1]))
+			return false;
 	}
 	return true;
 }
@@ -124,6 +132,13 @@ static int ebt_ip_mt_check(const struct xt_mtchk_param *par)
 		    info->icmp_code[0] > info->icmp_code[1])
 			return -EINVAL;
 	}
+	if (info->bitmask & EBT_IP_IGMP) {
+		if ((info->invflags & EBT_IP_PROTO) ||
+		    info->protocol != IPPROTO_IGMP)
+			return -EINVAL;
+		if (info->igmp_type[0] > info->igmp_type[1])
+			return -EINVAL;
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From 1ad22fb5bb53ce6bf5377f25529c0a007c61c6f5 Mon Sep 17 00:00:00 2001
From: Tosoni <jp.tosoni@acksys.fr>
Date: Wed, 14 Mar 2018 16:58:34 +0000
Subject: mac80211: inform wireless layer when frame RSSI is invalid

When the low-level driver returns an invalid RSSI indication,
set the signal value to 0 as an indication to the upper layer.

Also, skip average level computation if signal is invalid.

Signed-off-by: Jean Pierre TOSONI <jp.tosoni@acksys.fr>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mlme.c | 160 ++++++++++++++++++++++++++++------------------------
 net/mac80211/rx.c   |   6 +-
 net/mac80211/scan.c |   4 +-
 3 files changed, 93 insertions(+), 77 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 0024eff9bb84..ea3ba03fb952 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -3306,82 +3306,14 @@ static const u64 care_about_ies =
 	(1ULL << WLAN_EID_HT_OPERATION) |
 	(1ULL << WLAN_EID_EXT_CHANSWITCH_ANN);
 
-static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
-				     struct ieee80211_mgmt *mgmt, size_t len,
-				     struct ieee80211_rx_status *rx_status)
+static void ieee80211_handle_beacon_sig(struct ieee80211_sub_if_data *sdata,
+					struct ieee80211_if_managed *ifmgd,
+					struct ieee80211_bss_conf *bss_conf,
+					struct ieee80211_local *local,
+					struct ieee80211_rx_status *rx_status)
 {
-	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
-	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
-	size_t baselen;
-	struct ieee802_11_elems elems;
-	struct ieee80211_local *local = sdata->local;
-	struct ieee80211_chanctx_conf *chanctx_conf;
-	struct ieee80211_channel *chan;
-	struct sta_info *sta;
-	u32 changed = 0;
-	bool erp_valid;
-	u8 erp_value = 0;
-	u32 ncrc;
-	u8 *bssid;
-	u8 deauth_buf[IEEE80211_DEAUTH_FRAME_LEN];
-
-	sdata_assert_lock(sdata);
-
-	/* Process beacon from the current BSS */
-	baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt;
-	if (baselen > len)
-		return;
-
-	rcu_read_lock();
-	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
-	if (!chanctx_conf) {
-		rcu_read_unlock();
-		return;
-	}
-
-	if (rx_status->freq != chanctx_conf->def.chan->center_freq) {
-		rcu_read_unlock();
-		return;
-	}
-	chan = chanctx_conf->def.chan;
-	rcu_read_unlock();
-
-	if (ifmgd->assoc_data && ifmgd->assoc_data->need_beacon &&
-	    ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->bss->bssid)) {
-		ieee802_11_parse_elems(mgmt->u.beacon.variable,
-				       len - baselen, false, &elems);
-
-		ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems);
-		if (elems.tim && !elems.parse_error) {
-			const struct ieee80211_tim_ie *tim_ie = elems.tim;
-			ifmgd->dtim_period = tim_ie->dtim_period;
-		}
-		ifmgd->have_beacon = true;
-		ifmgd->assoc_data->need_beacon = false;
-		if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) {
-			sdata->vif.bss_conf.sync_tsf =
-				le64_to_cpu(mgmt->u.beacon.timestamp);
-			sdata->vif.bss_conf.sync_device_ts =
-				rx_status->device_timestamp;
-			if (elems.tim)
-				sdata->vif.bss_conf.sync_dtim_count =
-					elems.tim->dtim_count;
-			else
-				sdata->vif.bss_conf.sync_dtim_count = 0;
-		}
-		/* continue assoc process */
-		ifmgd->assoc_data->timeout = jiffies;
-		ifmgd->assoc_data->timeout_started = true;
-		run_again(sdata, ifmgd->assoc_data->timeout);
-		return;
-	}
-
-	if (!ifmgd->associated ||
-	    !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid))
-		return;
-	bssid = ifmgd->associated->bssid;
-
 	/* Track average RSSI from the Beacon frames of the current AP */
+
 	if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) {
 		ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE;
 		ewma_beacon_signal_init(&ifmgd->ave_beacon_signal);
@@ -3468,6 +3400,86 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
 				sig, GFP_KERNEL);
 		}
 	}
+}
+
+static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
+				     struct ieee80211_mgmt *mgmt, size_t len,
+				     struct ieee80211_rx_status *rx_status)
+{
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+	struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+	size_t baselen;
+	struct ieee802_11_elems elems;
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_chanctx_conf *chanctx_conf;
+	struct ieee80211_channel *chan;
+	struct sta_info *sta;
+	u32 changed = 0;
+	bool erp_valid;
+	u8 erp_value = 0;
+	u32 ncrc;
+	u8 *bssid;
+	u8 deauth_buf[IEEE80211_DEAUTH_FRAME_LEN];
+
+	sdata_assert_lock(sdata);
+
+	/* Process beacon from the current BSS */
+	baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt;
+	if (baselen > len)
+		return;
+
+	rcu_read_lock();
+	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	if (!chanctx_conf) {
+		rcu_read_unlock();
+		return;
+	}
+
+	if (rx_status->freq != chanctx_conf->def.chan->center_freq) {
+		rcu_read_unlock();
+		return;
+	}
+	chan = chanctx_conf->def.chan;
+	rcu_read_unlock();
+
+	if (ifmgd->assoc_data && ifmgd->assoc_data->need_beacon &&
+	    ether_addr_equal(mgmt->bssid, ifmgd->assoc_data->bss->bssid)) {
+		ieee802_11_parse_elems(mgmt->u.beacon.variable,
+				       len - baselen, false, &elems);
+
+		ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems);
+		if (elems.tim && !elems.parse_error) {
+			const struct ieee80211_tim_ie *tim_ie = elems.tim;
+			ifmgd->dtim_period = tim_ie->dtim_period;
+		}
+		ifmgd->have_beacon = true;
+		ifmgd->assoc_data->need_beacon = false;
+		if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) {
+			sdata->vif.bss_conf.sync_tsf =
+				le64_to_cpu(mgmt->u.beacon.timestamp);
+			sdata->vif.bss_conf.sync_device_ts =
+				rx_status->device_timestamp;
+			if (elems.tim)
+				sdata->vif.bss_conf.sync_dtim_count =
+					elems.tim->dtim_count;
+			else
+				sdata->vif.bss_conf.sync_dtim_count = 0;
+		}
+		/* continue assoc process */
+		ifmgd->assoc_data->timeout = jiffies;
+		ifmgd->assoc_data->timeout_started = true;
+		run_again(sdata, ifmgd->assoc_data->timeout);
+		return;
+	}
+
+	if (!ifmgd->associated ||
+	    !ether_addr_equal(mgmt->bssid, ifmgd->associated->bssid))
+		return;
+	bssid = ifmgd->associated->bssid;
+
+	if (!(rx_status->flag & RX_FLAG_NO_SIGNAL_VAL))
+		ieee80211_handle_beacon_sig(sdata, ifmgd, bss_conf,
+					    local, rx_status);
 
 	if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) {
 		mlme_dbg_ratelimited(sdata,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 9c898a3688c6..27bb1f0b5e52 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2804,7 +2804,8 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
 	    !(rx->flags & IEEE80211_RX_BEACON_REPORTED)) {
 		int sig = 0;
 
-		if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM))
+		if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM) &&
+		    !(status->flag & RX_FLAG_NO_SIGNAL_VAL))
 			sig = status->signal;
 
 		cfg80211_report_obss_beacon(rx->local->hw.wiphy,
@@ -3145,7 +3146,8 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx)
 	 * it transmitted were processed or returned.
 	 */
 
-	if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM))
+	if (ieee80211_hw_check(&rx->local->hw, SIGNAL_DBM) &&
+	    !(status->flag & RX_FLAG_NO_SIGNAL_VAL))
 		sig = status->signal;
 
 	if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig,
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index ef2becaade50..a3b1bcc2b461 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -73,7 +73,9 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
 	bool signal_valid;
 	struct ieee80211_sub_if_data *scan_sdata;
 
-	if (ieee80211_hw_check(&local->hw, SIGNAL_DBM))
+	if (rx_status->flag & RX_FLAG_NO_SIGNAL_VAL)
+		bss_meta.signal = 0; /* invalid signal indication */
+	else if (ieee80211_hw_check(&local->hw, SIGNAL_DBM))
 		bss_meta.signal = rx_status->signal * 100;
 	else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC))
 		bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal;
-- 
cgit v1.2.3


From 2cb021f5de55b1d158fa18b0215a4613c3289a82 Mon Sep 17 00:00:00 2001
From: Dmitry Lebed <dlebed@quantenna.com>
Date: Thu, 1 Mar 2018 12:39:16 +0300
Subject: cfg80211/nl80211: add CAC_STARTED event

CAC_STARTED event is needed for DFS offload feature and
should be generated by driver/HW if DFS_OFFLOAD is enabled.

Signed-off-by: Dmitry Lebed <dlebed@quantenna.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/mlme.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index bbb9907bfa86..6b6818dd76bd 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -888,14 +888,17 @@ void cfg80211_cac_event(struct net_device *netdev,
 		       sizeof(struct cfg80211_chan_def));
 		queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk);
 		cfg80211_sched_dfs_chan_update(rdev);
-		break;
+		/* fall through */
 	case NL80211_RADAR_CAC_ABORTED:
+		wdev->cac_started = false;
+		break;
+	case NL80211_RADAR_CAC_STARTED:
+		wdev->cac_started = true;
 		break;
 	default:
 		WARN_ON(1);
 		return;
 	}
-	wdev->cac_started = false;
 
 	nl80211_radar_notify(rdev, chandef, event, netdev, gfp);
 }
-- 
cgit v1.2.3


From 13cf6dec93e021ebd297619ea1926aea31b6430b Mon Sep 17 00:00:00 2001
From: Dmitry Lebed <dlebed@quantenna.com>
Date: Thu, 1 Mar 2018 12:39:15 +0300
Subject: cfg80211/nl80211: add DFS offload flag

Add wiphy EXT_FEATURE flag to indicate that HW or driver does
all DFS actions by itself.
User-space functionality already implemented in hostapd using
vendor-specific (QCA) OUI to advertise DFS offload support.
Need to introduce generic flag to inform about DFS offload support.
For devices with DFS_OFFLOAD flag set user-space will no longer
need to issue CAC or do any actions in response to
"radar detected" events. HW will do everything by itself and send
events to user-space to indicate that CAC was started/finished, etc.

Signed-off-by: Dmitrii Lebed <dlebed@quantenna.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index a910150f8169..fe27ab443d01 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7551,12 +7551,13 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
 	struct cfg80211_chan_def chandef;
 	enum nl80211_dfs_regions dfs_region;
 	unsigned int cac_time_ms;
 	int err;
 
-	dfs_region = reg_get_dfs_region(wdev->wiphy);
+	dfs_region = reg_get_dfs_region(wiphy);
 	if (dfs_region == NL80211_DFS_UNSET)
 		return -EINVAL;
 
@@ -7570,17 +7571,20 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
 	if (wdev->cac_started)
 		return -EBUSY;
 
-	err = cfg80211_chandef_dfs_required(wdev->wiphy, &chandef,
-					    wdev->iftype);
+	err = cfg80211_chandef_dfs_required(wiphy, &chandef, wdev->iftype);
 	if (err < 0)
 		return err;
 
 	if (err == 0)
 		return -EINVAL;
 
-	if (!cfg80211_chandef_dfs_usable(wdev->wiphy, &chandef))
+	if (!cfg80211_chandef_dfs_usable(wiphy, &chandef))
 		return -EINVAL;
 
+	/* CAC start is offloaded to HW and can't be started manually */
+	if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD))
+		return -EOPNOTSUPP;
+
 	if (!rdev->ops->start_radar_detection)
 		return -EOPNOTSUPP;
 
-- 
cgit v1.2.3


From 1ae7762760736d4f7e4ea43e9ed03a608685c3d9 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Mar 2018 14:39:05 +0300
Subject: net: Convert can_pernet_ops

These pernet_operations create and destroy /proc entries
and cancel per-net timer.

Also, there are unneed iterations over empty list of net
devices, since all net devices must be already moved
to init_net or unregistered by default_device_ops. This
already was mentioned here:

https://marc.info/?l=linux-can&m=150169589119335&w=2

So, it looks safe to make them async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/can/af_can.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/can/af_can.c b/net/can/af_can.c
index 6da324550eec..e899970398a1 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -954,6 +954,7 @@ static struct notifier_block can_netdev_notifier __read_mostly = {
 static struct pernet_operations can_pernet_ops __read_mostly = {
 	.init = can_pernet_init,
 	.exit = can_pernet_exit,
+	.async = true,
 };
 
 static __init int can_init(void)
-- 
cgit v1.2.3


From 08012631d6277cebe388971cd84fe14d1cc49a00 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Mar 2018 14:45:37 +0300
Subject: net: Convert lowpan_frags_ops

These pernet_operations register and unregister sysctl.
Also, there is inet_frags_exit_net() called in exit method,
which has to be safe after a560002437d3 "net: Fix hlist
corruptions in inet_evict_bucket()".

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/6lowpan/reassembly.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 85bf86ad6b18..a9ccb1322f69 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -603,6 +603,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net)
 static struct pernet_operations lowpan_frags_ops = {
 	.init = lowpan_frags_init_net,
 	.exit = lowpan_frags_exit_net,
+	.async = true,
 };
 
 int __init lowpan_net_frag_init(void)
-- 
cgit v1.2.3


From aa65f636540539e2e1fd77bdcd8fc7060d19d47b Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 19 Mar 2018 14:45:46 +0300
Subject: net: Convert nf_ct_net_ops

These pernet_operations register and unregister sysctl.
Also, there is inet_frags_exit_net() called in exit method,
which has to be safe after a560002437d3 "net: Fix hlist
corruptions in inet_evict_bucket()".

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index b84ce3e6d728..34136fe80ed5 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -646,6 +646,7 @@ static void nf_ct_net_exit(struct net *net)
 static struct pernet_operations nf_ct_net_ops = {
 	.init = nf_ct_net_init,
 	.exit = nf_ct_net_exit,
+	.async = true,
 };
 
 int nf_ct_frag6_init(void)
-- 
cgit v1.2.3


From bdf5bd7f21323493dbe5f2c723dc33f2fbb0241a Mon Sep 17 00:00:00 2001
From: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Date: Mon, 19 Mar 2018 06:52:48 -0700
Subject: rds: tcp: remove register_netdevice_notifier infrastructure.

The netns deletion path does not need to wait for all net_devices
to be unregistered before dismantling rds_tcp state for the netns
(we are able to dismantle this state on module unload even when
all net_devices are active so there is no dependency here).

This patch removes code related to netdevice notifiers and
refactors all the code needed to dismantle rds_tcp state
into a ->exit callback for the pernet_operations used with
register_pernet_device().

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Reviewed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rds/tcp.c | 93 +++++++++++++++--------------------------------------------
 1 file changed, 23 insertions(+), 70 deletions(-)

(limited to 'net')

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 08ea9cd5c2f6..4f3a32c38bf5 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -485,40 +485,6 @@ fail:
 	return err;
 }
 
-static void __net_exit rds_tcp_exit_net(struct net *net)
-{
-	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
-
-	if (rtn->rds_tcp_sysctl)
-		unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
-
-	if (net != &init_net && rtn->ctl_table)
-		kfree(rtn->ctl_table);
-
-	/* If rds_tcp_exit_net() is called as a result of netns deletion,
-	 * the rds_tcp_kill_sock() device notifier would already have cleaned
-	 * up the listen socket, thus there is no work to do in this function.
-	 *
-	 * If rds_tcp_exit_net() is called as a result of module unload,
-	 * i.e., due to rds_tcp_exit() -> unregister_pernet_subsys(), then
-	 * we do need to clean up the listen socket here.
-	 */
-	if (rtn->rds_tcp_listen_sock) {
-		struct socket *lsock = rtn->rds_tcp_listen_sock;
-
-		rtn->rds_tcp_listen_sock = NULL;
-		rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
-	}
-}
-
-static struct pernet_operations rds_tcp_net_ops = {
-	.init = rds_tcp_init_net,
-	.exit = rds_tcp_exit_net,
-	.id = &rds_tcp_netid,
-	.size = sizeof(struct rds_tcp_net),
-	.async = true,
-};
-
 static void rds_tcp_kill_sock(struct net *net)
 {
 	struct rds_tcp_connection *tc, *_tc;
@@ -546,40 +512,38 @@ static void rds_tcp_kill_sock(struct net *net)
 		rds_conn_destroy(tc->t_cpath->cp_conn);
 }
 
-void *rds_tcp_listen_sock_def_readable(struct net *net)
+static void __net_exit rds_tcp_exit_net(struct net *net)
 {
 	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
-	struct socket *lsock = rtn->rds_tcp_listen_sock;
 
-	if (!lsock)
-		return NULL;
+	rds_tcp_kill_sock(net);
 
-	return lsock->sk->sk_user_data;
+	if (rtn->rds_tcp_sysctl)
+		unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+
+	if (net != &init_net && rtn->ctl_table)
+		kfree(rtn->ctl_table);
 }
 
-static int rds_tcp_dev_event(struct notifier_block *this,
-			     unsigned long event, void *ptr)
+static struct pernet_operations rds_tcp_net_ops = {
+	.init = rds_tcp_init_net,
+	.exit = rds_tcp_exit_net,
+	.id = &rds_tcp_netid,
+	.size = sizeof(struct rds_tcp_net),
+	.async = true,
+};
+
+void *rds_tcp_listen_sock_def_readable(struct net *net)
 {
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+	struct socket *lsock = rtn->rds_tcp_listen_sock;
 
-	/* rds-tcp registers as a pernet subys, so the ->exit will only
-	 * get invoked after network acitivity has quiesced. We need to
-	 * clean up all sockets  to quiesce network activity, and use
-	 * the unregistration of the per-net loopback device as a trigger
-	 * to start that cleanup.
-	 */
-	if (event == NETDEV_UNREGISTER_FINAL &&
-	    dev->ifindex == LOOPBACK_IFINDEX)
-		rds_tcp_kill_sock(dev_net(dev));
+	if (!lsock)
+		return NULL;
 
-	return NOTIFY_DONE;
+	return lsock->sk->sk_user_data;
 }
 
-static struct notifier_block rds_tcp_dev_notifier = {
-	.notifier_call        = rds_tcp_dev_event,
-	.priority = -10, /* must be called after other network notifiers */
-};
-
 /* when sysctl is used to modify some kernel socket parameters,this
  * function  resets the RDS connections in that netns  so that we can
  * restart with new parameters.  The assumption is that such reset
@@ -625,9 +589,7 @@ static void rds_tcp_exit(void)
 	rds_tcp_set_unloading();
 	synchronize_rcu();
 	rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
-	unregister_pernet_subsys(&rds_tcp_net_ops);
-	if (unregister_netdevice_notifier(&rds_tcp_dev_notifier))
-		pr_warn("could not unregister rds_tcp_dev_notifier\n");
+	unregister_pernet_device(&rds_tcp_net_ops);
 	rds_tcp_destroy_conns();
 	rds_trans_unregister(&rds_tcp_transport);
 	rds_tcp_recv_exit();
@@ -651,24 +613,15 @@ static int rds_tcp_init(void)
 	if (ret)
 		goto out_slab;
 
-	ret = register_pernet_subsys(&rds_tcp_net_ops);
+	ret = register_pernet_device(&rds_tcp_net_ops);
 	if (ret)
 		goto out_recv;
 
-	ret = register_netdevice_notifier(&rds_tcp_dev_notifier);
-	if (ret) {
-		pr_warn("could not register rds_tcp_dev_notifier\n");
-		goto out_pernet;
-	}
-
 	rds_trans_register(&rds_tcp_transport);
 
 	rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
 
 	goto out;
-
-out_pernet:
-	unregister_pernet_subsys(&rds_tcp_net_ops);
 out_recv:
 	rds_tcp_recv_exit();
 out_slab:
-- 
cgit v1.2.3


From 145307460ba9c11489807de7acd3f4c7395f60b7 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 20 Mar 2018 19:31:14 -0700
Subject: devlink: Remove top_hierarchy arg to devlink_resource_register

top_hierarchy arg can be determined by comparing parent_resource_id to
DEVLINK_RESOURCE_ID_PARENT_TOP so it does not need to be a separate
argument.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/devlink.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/devlink.c b/net/core/devlink.c
index f23e5ed7c90f..d03b96f87c25 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -3174,7 +3174,6 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
  */
 int devlink_resource_register(struct devlink *devlink,
 			      const char *resource_name,
-			      bool top_hierarchy,
 			      u64 resource_size,
 			      u64 resource_id,
 			      u64 parent_resource_id,
@@ -3183,8 +3182,11 @@ int devlink_resource_register(struct devlink *devlink,
 {
 	struct devlink_resource *resource;
 	struct list_head *resource_list;
+	bool top_hierarchy;
 	int err = 0;
 
+	top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP;
+
 	mutex_lock(&devlink->lock);
 	resource = devlink_resource_find(devlink, NULL, resource_id);
 	if (resource) {
-- 
cgit v1.2.3


From dfde331e757fd792e1c9579b72a8370ca665e5ed Mon Sep 17 00:00:00 2001
From: GhantaKrishnamurthy MohanKrishna
 <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Date: Wed, 21 Mar 2018 14:37:43 +0100
Subject: tipc: modify socket iterator for sock_diag

The current socket iterator function tipc_nl_sk_dump, handles socket
locks and calls __tipc_nl_add_sk for each socket.
To reuse this logic in sock_diag implementation, we do minor
modifications to make these functions generic as described below.

In this commit, we add a two new functions __tipc_nl_sk_walk,
__tipc_nl_add_sk_info and modify tipc_nl_sk_dump, __tipc_nl_add_sk
accordingly.

In __tipc_nl_sk_walk we:
1. acquire and release socket locks
2. for each socket, execute the specified callback function

In __tipc_nl_add_sk we:
- Move the netlink attribute insertion to __tipc_nl_add_sk_info.

tipc_nl_sk_dump calls tipc_nl_sk_walk with __tipc_nl_add_sk as argument.

sock_diag will use these generic functions in a later commit.

There is no functional change in this commit.
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: GhantaKrishnamurthy MohanKrishna <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 65 +++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 41 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index a4a9148d4629..35ac0f5b9529 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -3160,16 +3160,33 @@ msg_full:
 	return -EMSGSIZE;
 }
 
+static int __tipc_nl_add_sk_info(struct sk_buff *skb, struct tipc_sock
+			  *tsk)
+{
+	struct net *net = sock_net(skb->sk);
+	struct tipc_net *tn = tipc_net(net);
+	struct sock *sk = &tsk->sk;
+
+	if (nla_put_u32(skb, TIPC_NLA_SOCK_REF, tsk->portid) ||
+	    nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tn->own_addr))
+		return -EMSGSIZE;
+
+	if (tipc_sk_connected(sk)) {
+		if (__tipc_nl_add_sk_con(skb, tsk))
+			return -EMSGSIZE;
+	} else if (!list_empty(&tsk->publications)) {
+		if (nla_put_flag(skb, TIPC_NLA_SOCK_HAS_PUBL))
+			return -EMSGSIZE;
+	}
+	return 0;
+}
+
 /* Caller should hold socket lock for the passed tipc socket. */
 static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb,
 			    struct tipc_sock *tsk)
 {
-	int err;
-	void *hdr;
 	struct nlattr *attrs;
-	struct net *net = sock_net(skb->sk);
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	struct sock *sk = &tsk->sk;
+	void *hdr;
 
 	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
 			  &tipc_genl_family, NLM_F_MULTI, TIPC_NL_SOCK_GET);
@@ -3179,19 +3196,10 @@ static int __tipc_nl_add_sk(struct sk_buff *skb, struct netlink_callback *cb,
 	attrs = nla_nest_start(skb, TIPC_NLA_SOCK);
 	if (!attrs)
 		goto genlmsg_cancel;
-	if (nla_put_u32(skb, TIPC_NLA_SOCK_REF, tsk->portid))
-		goto attr_msg_cancel;
-	if (nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tn->own_addr))
+
+	if (__tipc_nl_add_sk_info(skb, tsk))
 		goto attr_msg_cancel;
 
-	if (tipc_sk_connected(sk)) {
-		err = __tipc_nl_add_sk_con(skb, tsk);
-		if (err)
-			goto attr_msg_cancel;
-	} else if (!list_empty(&tsk->publications)) {
-		if (nla_put_flag(skb, TIPC_NLA_SOCK_HAS_PUBL))
-			goto attr_msg_cancel;
-	}
 	nla_nest_end(skb, attrs);
 	genlmsg_end(skb, hdr);
 
@@ -3205,16 +3213,19 @@ msg_cancel:
 	return -EMSGSIZE;
 }
 
-int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb)
+static int __tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb,
+			     int (*skb_handler)(struct sk_buff *skb,
+						struct netlink_callback *cb,
+						struct tipc_sock *tsk))
 {
-	int err;
-	struct tipc_sock *tsk;
-	const struct bucket_table *tbl;
-	struct rhash_head *pos;
 	struct net *net = sock_net(skb->sk);
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	u32 tbl_id = cb->args[0];
+	struct tipc_net *tn = tipc_net(net);
+	const struct bucket_table *tbl;
 	u32 prev_portid = cb->args[1];
+	u32 tbl_id = cb->args[0];
+	struct rhash_head *pos;
+	struct tipc_sock *tsk;
+	int err;
 
 	rcu_read_lock();
 	tbl = rht_dereference_rcu((&tn->sk_rht)->tbl, &tn->sk_rht);
@@ -3226,12 +3237,13 @@ int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb)
 				continue;
 			}
 
-			err = __tipc_nl_add_sk(skb, cb, tsk);
+			err = skb_handler(skb, cb, tsk);
 			if (err) {
 				prev_portid = tsk->portid;
 				spin_unlock_bh(&tsk->sk.sk_lock.slock);
 				goto out;
 			}
+
 			prev_portid = 0;
 			spin_unlock_bh(&tsk->sk.sk_lock.slock);
 		}
@@ -3244,6 +3256,11 @@ out:
 	return skb->len;
 }
 
+int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	return __tipc_nl_sk_walk(skb, cb, __tipc_nl_add_sk);
+}
+
 /* Caller should hold socket lock for the passed tipc socket. */
 static int __tipc_nl_add_sk_publ(struct sk_buff *skb,
 				 struct netlink_callback *cb,
-- 
cgit v1.2.3


From c30b70deb5f4861f590031c33fd3ec6cc63f1df1 Mon Sep 17 00:00:00 2001
From: GhantaKrishnamurthy MohanKrishna
 <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Date: Wed, 21 Mar 2018 14:37:44 +0100
Subject: tipc: implement socket diagnostics for AF_TIPC

This commit adds socket diagnostics capability for AF_TIPC in netlink
family NETLINK_SOCK_DIAG in a new kernel module (diag.ko).

The following are key design considerations:
- config TIPC_DIAG has default y, like INET_DIAG.
- only requests with flag NLM_F_DUMP is supported (dump all).
- tipc_sock_diag_req message is introduced to send filter parameters.
- the response attributes are of TLV, some nested.

To avoid exposing data structures between diag and tipc modules and
avoid code duplication, the following additions are required:
- export tipc_nl_sk_walk function to reuse socket iterator.
- export tipc_sk_fill_sock_diag to fill the tipc diag attributes.
- create a sock_diag response message in __tipc_add_sock_diag defined
  in diag.c and use the above exported tipc_sk_fill_sock_diag
  to fill response.

Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: GhantaKrishnamurthy MohanKrishna <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/Kconfig  |   8 ++++
 net/tipc/Makefile |   5 +++
 net/tipc/diag.c   | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/tipc/socket.c |  72 +++++++++++++++++++++++++++++++---
 net/tipc/socket.h |  10 ++++-
 5 files changed, 203 insertions(+), 6 deletions(-)
 create mode 100644 net/tipc/diag.c

(limited to 'net')

diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig
index c25a3a149dc4..e450212121d2 100644
--- a/net/tipc/Kconfig
+++ b/net/tipc/Kconfig
@@ -34,3 +34,11 @@ config TIPC_MEDIA_UDP
 	  Saying Y here will enable support for running TIPC over IP/UDP
 	bool
 	default y
+
+config TIPC_DIAG
+	tristate "TIPC: socket monitoring interface"
+	depends on TIPC
+	default y
+	---help---
+	Support for TIPC socket monitoring interface used by ss tool.
+	If unsure, say Y.
diff --git a/net/tipc/Makefile b/net/tipc/Makefile
index 1edb7192aa2f..aca168f2abb1 100644
--- a/net/tipc/Makefile
+++ b/net/tipc/Makefile
@@ -14,3 +14,8 @@ tipc-y	+= addr.o bcast.o bearer.o \
 tipc-$(CONFIG_TIPC_MEDIA_UDP)	+= udp_media.o
 tipc-$(CONFIG_TIPC_MEDIA_IB)	+= ib_media.o
 tipc-$(CONFIG_SYSCTL)		+= sysctl.o
+
+
+obj-$(CONFIG_TIPC_DIAG)	+= diag.o
+
+tipc_diag-y	:= diag.o
diff --git a/net/tipc/diag.c b/net/tipc/diag.c
new file mode 100644
index 000000000000..46d9cd62f781
--- /dev/null
+++ b/net/tipc/diag.c
@@ -0,0 +1,114 @@
+/*
+ * net/tipc/diag.c: TIPC socket diag
+ *
+ * Copyright (c) 2018, Ericsson AB
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the names of the copyright holders nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * Alternatively, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") version 2 as published by the Free
+ * Software Foundation.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "core.h"
+#include "socket.h"
+#include <linux/sock_diag.h>
+#include <linux/tipc_sockets_diag.h>
+
+static u64 __tipc_diag_gen_cookie(struct sock *sk)
+{
+	u32 res[2];
+
+	sock_diag_save_cookie(sk, res);
+	return *((u64 *)res);
+}
+
+static int __tipc_add_sock_diag(struct sk_buff *skb,
+				struct netlink_callback *cb,
+				struct tipc_sock *tsk)
+{
+	struct tipc_sock_diag_req *req = nlmsg_data(cb->nlh);
+	struct nlmsghdr *nlh;
+	int err;
+
+	nlh = nlmsg_put_answer(skb, cb, SOCK_DIAG_BY_FAMILY, 0,
+			       NLM_F_MULTI);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	err = tipc_sk_fill_sock_diag(skb, tsk, req->tidiag_states,
+				     __tipc_diag_gen_cookie);
+	if (err)
+		return err;
+
+	nlmsg_end(skb, nlh);
+	return 0;
+}
+
+static int tipc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	return tipc_nl_sk_walk(skb, cb, __tipc_add_sock_diag);
+}
+
+static int tipc_sock_diag_handler_dump(struct sk_buff *skb,
+				       struct nlmsghdr *h)
+{
+	int hdrlen = sizeof(struct tipc_sock_diag_req);
+	struct net *net = sock_net(skb->sk);
+
+	if (nlmsg_len(h) < hdrlen)
+		return -EINVAL;
+
+	if (h->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.dump = tipc_diag_dump,
+		};
+		netlink_dump_start(net->diag_nlsk, skb, h, &c);
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+
+static const struct sock_diag_handler tipc_sock_diag_handler = {
+	.family = AF_TIPC,
+	.dump = tipc_sock_diag_handler_dump,
+};
+
+static int __init tipc_diag_init(void)
+{
+	return sock_diag_register(&tipc_sock_diag_handler);
+}
+
+static void __exit tipc_diag_exit(void)
+{
+	sock_diag_unregister(&tipc_sock_diag_handler);
+}
+
+module_init(tipc_diag_init);
+module_exit(tipc_diag_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_TIPC);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 35ac0f5b9529..07559ce4b8ba 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -3213,10 +3213,10 @@ msg_cancel:
 	return -EMSGSIZE;
 }
 
-static int __tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb,
-			     int (*skb_handler)(struct sk_buff *skb,
-						struct netlink_callback *cb,
-						struct tipc_sock *tsk))
+int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb,
+		    int (*skb_handler)(struct sk_buff *skb,
+				       struct netlink_callback *cb,
+				       struct tipc_sock *tsk))
 {
 	struct net *net = sock_net(skb->sk);
 	struct tipc_net *tn = tipc_net(net);
@@ -3255,10 +3255,72 @@ out:
 
 	return skb->len;
 }
+EXPORT_SYMBOL(tipc_nl_sk_walk);
+
+int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk,
+			   u32 sk_filter_state,
+			   u64 (*tipc_diag_gen_cookie)(struct sock *sk))
+{
+	struct sock *sk = &tsk->sk;
+	struct nlattr *attrs;
+	struct nlattr *stat;
+
+	/*filter response w.r.t sk_state*/
+	if (!(sk_filter_state & (1 << sk->sk_state)))
+		return 0;
+
+	attrs = nla_nest_start(skb, TIPC_NLA_SOCK);
+	if (!attrs)
+		goto msg_cancel;
+
+	if (__tipc_nl_add_sk_info(skb, tsk))
+		goto attr_msg_cancel;
+
+	if (nla_put_u32(skb, TIPC_NLA_SOCK_TYPE, (u32)sk->sk_type) ||
+	    nla_put_u32(skb, TIPC_NLA_SOCK_TIPC_STATE, (u32)sk->sk_state) ||
+	    nla_put_u32(skb, TIPC_NLA_SOCK_INO, sock_i_ino(sk)) ||
+	    nla_put_u32(skb, TIPC_NLA_SOCK_UID,
+			from_kuid_munged(sk_user_ns(sk), sock_i_uid(sk))) ||
+	    nla_put_u64_64bit(skb, TIPC_NLA_SOCK_COOKIE,
+			      tipc_diag_gen_cookie(sk),
+			      TIPC_NLA_SOCK_PAD))
+		goto attr_msg_cancel;
+
+	stat = nla_nest_start(skb, TIPC_NLA_SOCK_STAT);
+	if (!stat)
+		goto attr_msg_cancel;
+
+	if (nla_put_u32(skb, TIPC_NLA_SOCK_STAT_RCVQ,
+			skb_queue_len(&sk->sk_receive_queue)) ||
+	    nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ,
+			skb_queue_len(&sk->sk_write_queue)))
+		goto stat_msg_cancel;
+
+	if (tsk->cong_link_cnt &&
+	    nla_put_flag(skb, TIPC_NLA_SOCK_STAT_LINK_CONG))
+		goto stat_msg_cancel;
+
+	if (tsk_conn_cong(tsk) &&
+	    nla_put_flag(skb, TIPC_NLA_SOCK_STAT_CONN_CONG))
+		goto stat_msg_cancel;
+
+	nla_nest_end(skb, stat);
+	nla_nest_end(skb, attrs);
+
+	return 0;
+
+stat_msg_cancel:
+	nla_nest_cancel(skb, stat);
+attr_msg_cancel:
+	nla_nest_cancel(skb, attrs);
+msg_cancel:
+	return -EMSGSIZE;
+}
+EXPORT_SYMBOL(tipc_sk_fill_sock_diag);
 
 int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	return __tipc_nl_sk_walk(skb, cb, __tipc_nl_add_sk);
+	return tipc_nl_sk_walk(skb, cb, __tipc_nl_add_sk);
 }
 
 /* Caller should hold socket lock for the passed tipc socket. */
diff --git a/net/tipc/socket.h b/net/tipc/socket.h
index 06fb5944cf76..aae3fd4cd06c 100644
--- a/net/tipc/socket.h
+++ b/net/tipc/socket.h
@@ -49,6 +49,8 @@
 #define RCVBUF_DEF  (FLOWCTL_BLK_SZ * 1024 * 2)
 #define RCVBUF_MAX  (FLOWCTL_BLK_SZ * 1024 * 16)
 
+struct tipc_sock;
+
 int tipc_socket_init(void);
 void tipc_socket_stop(void);
 void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq);
@@ -59,5 +61,11 @@ int tipc_sk_rht_init(struct net *net);
 void tipc_sk_rht_destroy(struct net *net);
 int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb);
 int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb);
-
+int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk,
+			   u32 sk_filter_state,
+			   u64 (*tipc_diag_gen_cookie)(struct sock *sk));
+int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb,
+		    int (*skb_handler)(struct sk_buff *skb,
+				       struct netlink_callback *cb,
+				       struct tipc_sock *tsk));
 #endif
-- 
cgit v1.2.3


From 872619d8cf810c17279335ef531a2a34f3b4e589 Mon Sep 17 00:00:00 2001
From: GhantaKrishnamurthy MohanKrishna
 <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Date: Wed, 21 Mar 2018 14:37:45 +0100
Subject: tipc: step sk->sk_drops when rcv buffer is full

Currently when tipc is unable to queue a received message on a
socket, the message is rejected back to the sender with error
TIPC_ERR_OVERLOAD. However, the application on this socket
has no knowledge about these discards.

In this commit, we try to step the sk_drops counter when tipc
is unable to queue a received message. Export sk_drops
using tipc socket diagnostics.

Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: GhantaKrishnamurthy MohanKrishna <mohan.krishna.ghanta.krishnamurthy@ericsson.com>
Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 07559ce4b8ba..732ec894f69f 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2122,8 +2122,10 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb,
 		    (!sk_conn && msg_connected(hdr)) ||
 		    (!grp && msg_in_group(hdr)))
 			err = TIPC_ERR_NO_PORT;
-		else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit)
+		else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) {
+			atomic_inc(&sk->sk_drops);
 			err = TIPC_ERR_OVERLOAD;
+		}
 
 		if (unlikely(err)) {
 			tipc_skb_reject(net, err, skb, xmitq);
@@ -2202,6 +2204,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
 
 		/* Overload => reject message back to sender */
 		onode = tipc_own_addr(sock_net(sk));
+		atomic_inc(&sk->sk_drops);
 		if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD))
 			__skb_queue_tail(xmitq, skb);
 		break;
@@ -3293,7 +3296,9 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk,
 	if (nla_put_u32(skb, TIPC_NLA_SOCK_STAT_RCVQ,
 			skb_queue_len(&sk->sk_receive_queue)) ||
 	    nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ,
-			skb_queue_len(&sk->sk_write_queue)))
+			skb_queue_len(&sk->sk_write_queue)) ||
+	    nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP,
+			atomic_read(&sk->sk_drops)))
 		goto stat_msg_cancel;
 
 	if (tsk->cong_link_cnt &&
-- 
cgit v1.2.3


From 1574639411b3ce311b846cae7fda56bf1af7d027 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 21 Mar 2018 19:34:58 +0000
Subject: gre: fix TUNNEL_SEQ bit check on sequence numbering

The current logic of flags | TUNNEL_SEQ is always non-zero and hence
sequence numbers are always incremented no matter the setting of the
TUNNEL_SEQ bit.  Fix this by using & instead of |.

Detected by CoverityScan, CID#1466039 ("Operands don't affect result")

Fixes: 77a5196a804e ("gre: add sequence number for collect md mode.")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: William Tu <u9012063@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c  | 2 +-
 net/ipv6/ip6_gre.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 2fa2ef2e2af9..9ab1aa2f7660 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -550,7 +550,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 		(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
 	gre_build_header(skb, tunnel_hlen, flags, proto,
 			 tunnel_id_to_key32(tun_info->key.tun_id),
-			 (flags | TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);
+			 (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);
 
 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
 
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 7d8775c9570d..6adbcf40cf8c 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -724,7 +724,7 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 		gre_build_header(skb, tunnel->tun_hlen,
 				 flags, protocol,
 				 tunnel_id_to_key32(tun_info->key.tun_id),
-				 (flags | TUNNEL_SEQ) ? htonl(tunnel->o_seqno++)
+				 (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++)
 						      : 0);
 
 	} else {
-- 
cgit v1.2.3


From 76d3e153d0d16032ab68e3b698c4df05acef48c5 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 22 Mar 2018 12:45:02 +0300
Subject: net: Revert "ipv4: get rid of ip_ra_lock"

This reverts commit ba3f571d5dde. The commit was made
after 1215e51edad1 "ipv4: fix a deadlock in ip_ra_control",
and killed ip_ra_lock, which became useless after rtnl_lock()
made used to destroy every raw ipv4 socket. This scales
very bad, and next patch in series reverts 1215e51edad1.
ip_ra_lock will be used again.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 74c962b9b09c..be7c3b71914d 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -334,6 +334,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
    sent to multicast group to reach destination designated router.
  */
 struct ip_ra_chain __rcu *ip_ra_chain;
+static DEFINE_SPINLOCK(ip_ra_lock);
 
 
 static void ip_ra_destroy_rcu(struct rcu_head *head)
@@ -355,17 +356,21 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 
 	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
 
+	spin_lock_bh(&ip_ra_lock);
 	for (rap = &ip_ra_chain;
-	     (ra = rtnl_dereference(*rap)) != NULL;
+	     (ra = rcu_dereference_protected(*rap,
+			lockdep_is_held(&ip_ra_lock))) != NULL;
 	     rap = &ra->next) {
 		if (ra->sk == sk) {
 			if (on) {
+				spin_unlock_bh(&ip_ra_lock);
 				kfree(new_ra);
 				return -EADDRINUSE;
 			}
 			/* dont let ip_call_ra_chain() use sk again */
 			ra->sk = NULL;
 			RCU_INIT_POINTER(*rap, ra->next);
+			spin_unlock_bh(&ip_ra_lock);
 
 			if (ra->destructor)
 				ra->destructor(sk);
@@ -379,14 +384,17 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 			return 0;
 		}
 	}
-	if (!new_ra)
+	if (!new_ra) {
+		spin_unlock_bh(&ip_ra_lock);
 		return -ENOBUFS;
+	}
 	new_ra->sk = sk;
 	new_ra->destructor = destructor;
 
 	RCU_INIT_POINTER(new_ra->next, ra);
 	rcu_assign_pointer(*rap, new_ra);
 	sock_hold(sk);
+	spin_unlock_bh(&ip_ra_lock);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 0526947f9dd019da8d550ed20177585a84b3460e Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 22 Mar 2018 12:45:12 +0300
Subject: net: Move IP_ROUTER_ALERT out of lock_sock(sk)

ip_ra_control() does not need sk_lock. Who are the another
users of ip_ra_chain? ip_mroute_setsockopt() doesn't take
sk_lock, while parallel IP_ROUTER_ALERT syscalls are
synchronized by ip_ra_lock. So, we may move this command
out of sk_lock.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index be7c3b71914d..dcbf6afe27e7 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -647,6 +647,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
 
 	/* If optlen==0, it is equivalent to val == 0 */
 
+	if (optname == IP_ROUTER_ALERT)
+		return ip_ra_control(sk, val ? 1 : 0, NULL);
 	if (ip_mroute_opt(optname))
 		return ip_mroute_setsockopt(sk, optname, optval, optlen);
 
@@ -1157,9 +1159,6 @@ mc_msf_out:
 			goto e_inval;
 		inet->mc_all = val;
 		break;
-	case IP_ROUTER_ALERT:
-		err = ip_ra_control(sk, val ? 1 : 0, NULL);
-		break;
 
 	case IP_FREEBIND:
 		if (optlen < 1)
-- 
cgit v1.2.3


From 128aaa98ad1422eacc4c74879840cb3e579cd934 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 22 Mar 2018 12:45:22 +0300
Subject: net: Revert "ipv4: fix a deadlock in ip_ra_control"

This reverts commit 1215e51edad1.
Since raw_close() is used on every RAW socket destruction,
the changes made by 1215e51edad1 scale sadly. This clearly
seen on endless unshare(CLONE_NEWNET) test, and cleanup_net()
kwork spends a lot of time waiting for rtnl_lock() introduced
by this commit.

Previous patch moved IP_ROUTER_ALERT out of rtnl_lock(),
so we revert this patch.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_sockglue.c |  1 -
 net/ipv4/ipmr.c        | 11 +++++++++--
 net/ipv4/raw.c         |  2 --
 3 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index dcbf6afe27e7..bf5f44b27b7e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -594,7 +594,6 @@ static bool setsockopt_needs_rtnl(int optname)
 	case MCAST_LEAVE_GROUP:
 	case MCAST_LEAVE_SOURCE_GROUP:
 	case MCAST_UNBLOCK_SOURCE:
-	case IP_ROUTER_ALERT:
 		return true;
 	}
 	return false;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index d752a70855d8..f6be5db16da2 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1399,7 +1399,7 @@ static void mrtsock_destruct(struct sock *sk)
 	struct net *net = sock_net(sk);
 	struct mr_table *mrt;
 
-	ASSERT_RTNL();
+	rtnl_lock();
 	ipmr_for_each_table(mrt, net) {
 		if (sk == rtnl_dereference(mrt->mroute_sk)) {
 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
@@ -1411,6 +1411,7 @@ static void mrtsock_destruct(struct sock *sk)
 			mroute_clean_tables(mrt, false);
 		}
 	}
+	rtnl_unlock();
 }
 
 /* Socket options and virtual interface manipulation. The whole
@@ -1475,8 +1476,13 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
 		if (sk != rcu_access_pointer(mrt->mroute_sk)) {
 			ret = -EACCES;
 		} else {
+			/* We need to unlock here because mrtsock_destruct takes
+			 * care of rtnl itself and we can't change that due to
+			 * the IP_ROUTER_ALERT setsockopt which runs without it.
+			 */
+			rtnl_unlock();
 			ret = ip_ra_control(sk, 0, NULL);
-			goto out_unlock;
+			goto out;
 		}
 		break;
 	case MRT_ADD_VIF:
@@ -1588,6 +1594,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
 	}
 out_unlock:
 	rtnl_unlock();
+out:
 	return ret;
 }
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 54648d20bf0f..720bef7da2f6 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -711,9 +711,7 @@ static void raw_close(struct sock *sk, long timeout)
 	/*
 	 * Raw sockets may have direct kernel references. Kill them.
 	 */
-	rtnl_lock();
 	ip_ra_control(sk, 0, NULL);
-	rtnl_unlock();
 
 	sk_common_release(sk);
 }
-- 
cgit v1.2.3


From 5796ef75ec7b6019eac88f66751d663d537a5cd3 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 22 Mar 2018 12:45:32 +0300
Subject: net: Make ip_ra_chain per struct net

This is optimization, which makes ip_call_ra_chain()
iterate less sockets to find the sockets it's looking for.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_input.c    |  5 ++---
 net/ipv4/ip_sockglue.c | 15 ++-------------
 2 files changed, 4 insertions(+), 16 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 57fc13c6ab2b..7582713dd18f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -159,7 +159,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
 	struct net_device *dev = skb->dev;
 	struct net *net = dev_net(dev);
 
-	for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
+	for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) {
 		struct sock *sk = ra->sk;
 
 		/* If socket is bound to an interface, only report
@@ -167,8 +167,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
 		 */
 		if (sk && inet_sk(sk)->inet_num == protocol &&
 		    (!sk->sk_bound_dev_if ||
-		     sk->sk_bound_dev_if == dev->ifindex) &&
-		    net_eq(sock_net(sk), net)) {
+		     sk->sk_bound_dev_if == dev->ifindex)) {
 			if (ip_is_fragment(ip_hdr(skb))) {
 				if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
 					return true;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index bf5f44b27b7e..f36d35fe924b 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -322,18 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 	return 0;
 }
 
-
-/* Special input handler for packets caught by router alert option.
-   They are selected only by protocol field, and then processed likely
-   local ones; but only if someone wants them! Otherwise, router
-   not running rsvpd will kill RSVP.
-
-   It is user level problem, what it will make with them.
-   I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
-   but receiver should be enough clever f.e. to forward mtrace requests,
-   sent to multicast group to reach destination designated router.
- */
-struct ip_ra_chain __rcu *ip_ra_chain;
 static DEFINE_SPINLOCK(ip_ra_lock);
 
 
@@ -350,6 +338,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 {
 	struct ip_ra_chain *ra, *new_ra;
 	struct ip_ra_chain __rcu **rap;
+	struct net *net = sock_net(sk);
 
 	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
 		return -EINVAL;
@@ -357,7 +346,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
 
 	spin_lock_bh(&ip_ra_lock);
-	for (rap = &ip_ra_chain;
+	for (rap = &net->ipv4.ra_chain;
 	     (ra = rcu_dereference_protected(*rap,
 			lockdep_is_held(&ip_ra_lock))) != NULL;
 	     rap = &ra->next) {
-- 
cgit v1.2.3


From d9ff3049739e349b5380b96226f9ad766741773d Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 22 Mar 2018 12:45:40 +0300
Subject: net: Replace ip_ra_lock with per-net mutex

Since ra_chain is per-net, we may use per-net mutexes
to protect them in ip_ra_control(). This improves
scalability.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c |  1 +
 net/ipv4/ip_sockglue.c   | 15 ++++++---------
 2 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index c340d5cfbdec..95ba2c53bd9a 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -301,6 +301,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	net->user_ns = user_ns;
 	idr_init(&net->netns_ids);
 	spin_lock_init(&net->nsid_lock);
+	mutex_init(&net->ipv4.ra_mutex);
 
 	list_for_each_entry(ops, &pernet_list, list) {
 		error = ops_init(ops, net);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index f36d35fe924b..5ad2d8ed3a3f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -322,9 +322,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 	return 0;
 }
 
-static DEFINE_SPINLOCK(ip_ra_lock);
-
-
 static void ip_ra_destroy_rcu(struct rcu_head *head)
 {
 	struct ip_ra_chain *ra = container_of(head, struct ip_ra_chain, rcu);
@@ -345,21 +342,21 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 
 	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
 
-	spin_lock_bh(&ip_ra_lock);
+	mutex_lock(&net->ipv4.ra_mutex);
 	for (rap = &net->ipv4.ra_chain;
 	     (ra = rcu_dereference_protected(*rap,
-			lockdep_is_held(&ip_ra_lock))) != NULL;
+			lockdep_is_held(&net->ipv4.ra_mutex))) != NULL;
 	     rap = &ra->next) {
 		if (ra->sk == sk) {
 			if (on) {
-				spin_unlock_bh(&ip_ra_lock);
+				mutex_unlock(&net->ipv4.ra_mutex);
 				kfree(new_ra);
 				return -EADDRINUSE;
 			}
 			/* dont let ip_call_ra_chain() use sk again */
 			ra->sk = NULL;
 			RCU_INIT_POINTER(*rap, ra->next);
-			spin_unlock_bh(&ip_ra_lock);
+			mutex_unlock(&net->ipv4.ra_mutex);
 
 			if (ra->destructor)
 				ra->destructor(sk);
@@ -374,7 +371,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 		}
 	}
 	if (!new_ra) {
-		spin_unlock_bh(&ip_ra_lock);
+		mutex_unlock(&net->ipv4.ra_mutex);
 		return -ENOBUFS;
 	}
 	new_ra->sk = sk;
@@ -383,7 +380,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
 	RCU_INIT_POINTER(new_ra->next, ra);
 	rcu_assign_pointer(*rap, new_ra);
 	sock_hold(sk);
-	spin_unlock_bh(&ip_ra_lock);
+	mutex_unlock(&net->ipv4.ra_mutex);
 
 	return 0;
 }
-- 
cgit v1.2.3


From dcbe73ca55a42712bfd0e9966cd2d5a48355ace3 Mon Sep 17 00:00:00 2001
From: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
Date: Thu, 22 Mar 2018 12:18:03 -0700
Subject: mac80211: notify driver for change in multicast rates

With drivers implementing rate control in driver or firmware
rate_control_send_low() may not get called, and thus the
driver needs to know about changes in the multicast rate.

Add and use a new BSS change flag for this.

Signed-off-by: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org>
[rewrite commit message]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/cfg.c  | 2 ++
 net/mac80211/ibss.c | 2 +-
 net/mac80211/mesh.c | 3 ++-
 net/mac80211/util.c | 3 ++-
 4 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index fd68f6fb02d7..5c4b105ca398 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -2313,6 +2313,8 @@ static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev,
 	memcpy(sdata->vif.bss_conf.mcast_rate, rate,
 	       sizeof(int) * NUM_NL80211_BANDS);
 
+	ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MCAST_RATE);
+
 	return 0;
 }
 
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index db07e0de9a03..dc582aa35c89 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -1839,7 +1839,7 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
 		  IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED
 		| IEEE80211_HT_PARAM_RIFS_MODE;
 
-	changed |= BSS_CHANGED_HT;
+	changed |= BSS_CHANGED_HT | BSS_CHANGED_MCAST_RATE;
 	ieee80211_bss_info_change_notify(sdata, changed);
 
 	sdata->smps_mode = IEEE80211_SMPS_OFF;
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 6a381cbe1e33..d51da26e9c18 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -880,7 +880,8 @@ int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata)
 		      BSS_CHANGED_BEACON_ENABLED |
 		      BSS_CHANGED_HT |
 		      BSS_CHANGED_BASIC_RATES |
-		      BSS_CHANGED_BEACON_INT;
+		      BSS_CHANGED_BEACON_INT |
+		      BSS_CHANGED_MCAST_RATE;
 
 	local->fif_other_bss++;
 	/* mesh ifaces must set allmulti to forward mcast traffic */
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 1f82191ce601..55cd2922627a 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1968,7 +1968,8 @@ int ieee80211_reconfig(struct ieee80211_local *local)
 			  BSS_CHANGED_CQM |
 			  BSS_CHANGED_QOS |
 			  BSS_CHANGED_IDLE |
-			  BSS_CHANGED_TXPOWER;
+			  BSS_CHANGED_TXPOWER |
+			  BSS_CHANGED_MCAST_RATE;
 
 		if (sdata->vif.mu_mimo_owner)
 			changed |= BSS_CHANGED_MU_GROUPS;
-- 
cgit v1.2.3


From 419d14af9e07fb5ca32b1b1614793c6b1e242152 Mon Sep 17 00:00:00 2001
From: Chas Williams <3chas3@gmail.com>
Date: Thu, 22 Mar 2018 11:34:06 -0400
Subject: bridge: Allow max MTU when multiple VLANs present

If the bridge is allowing multiple VLANs, some VLANs may have
different MTUs.  Instead of choosing the minimum MTU for the
bridge interface, choose the maximum MTU of the bridge members.
With this the user only needs to set a larger MTU on the member
ports that are participating in the large MTU VLANS.

Signed-off-by: Chas Williams <3chas3@gmail.com>
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br.c         |  2 +-
 net/bridge/br_device.c  |  2 +-
 net/bridge/br_if.c      | 26 ++++++++++++++++++++++----
 net/bridge/br_private.h |  2 +-
 4 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br.c b/net/bridge/br.c
index 7770481a6506..a3f95ab9d6a3 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -52,7 +52,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 
 	switch (event) {
 	case NETDEV_CHANGEMTU:
-		dev_set_mtu(br->dev, br_min_mtu(br));
+		dev_set_mtu(br->dev, br_mtu(br));
 		break;
 
 	case NETDEV_CHANGEADDR:
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 1285ca30ab0a..278fc999d355 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -224,7 +224,7 @@ static void br_get_stats64(struct net_device *dev,
 static int br_change_mtu(struct net_device *dev, int new_mtu)
 {
 	struct net_bridge *br = netdev_priv(dev);
-	if (new_mtu > br_min_mtu(br))
+	if (new_mtu > br_mtu(br))
 		return -EINVAL;
 
 	dev->mtu = new_mtu;
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 9ba4ed65c52b..48dc4d2e2be3 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -424,8 +424,18 @@ int br_del_bridge(struct net *net, const char *name)
 	return ret;
 }
 
+static bool min_mtu(int a, int b)
+{
+	return a < b ? 1 : 0;
+}
+
+static bool max_mtu(int a, int b)
+{
+	return a > b ? 1 : 0;
+}
+
 /* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */
-int br_min_mtu(const struct net_bridge *br)
+static int __br_mtu(const struct net_bridge *br, bool (compare_fn)(int, int))
 {
 	const struct net_bridge_port *p;
 	int mtu = 0;
@@ -436,13 +446,21 @@ int br_min_mtu(const struct net_bridge *br)
 		mtu = ETH_DATA_LEN;
 	else {
 		list_for_each_entry(p, &br->port_list, list) {
-			if (!mtu  || p->dev->mtu < mtu)
+			if (!mtu || compare_fn(p->dev->mtu, mtu))
 				mtu = p->dev->mtu;
 		}
 	}
 	return mtu;
 }
 
+int br_mtu(const struct net_bridge *br)
+{
+	if (br->vlan_enabled)
+		return __br_mtu(br, max_mtu);
+	else
+		return __br_mtu(br, min_mtu);
+}
+
 static void br_set_gso_limits(struct net_bridge *br)
 {
 	unsigned int gso_max_size = GSO_MAX_SIZE;
@@ -594,7 +612,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
 	if (changed_addr)
 		call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
 
-	dev_set_mtu(br->dev, br_min_mtu(br));
+	dev_set_mtu(br->dev, br_mtu(br));
 	br_set_gso_limits(br);
 
 	kobject_uevent(&p->kobj, KOBJ_ADD);
@@ -641,7 +659,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
 	 */
 	del_nbp(p);
 
-	dev_set_mtu(br->dev, br_min_mtu(br));
+	dev_set_mtu(br->dev, br_mtu(br));
 	br_set_gso_limits(br);
 
 	spin_lock_bh(&br->lock);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 8e13a64d8c99..048d5b51813b 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -578,7 +578,7 @@ int br_del_bridge(struct net *net, const char *name);
 int br_add_if(struct net_bridge *br, struct net_device *dev,
 	      struct netlink_ext_ack *extack);
 int br_del_if(struct net_bridge *br, struct net_device *dev);
-int br_min_mtu(const struct net_bridge *br);
+int br_mtu(const struct net_bridge *br);
 netdev_features_t br_features_recompute(struct net_bridge *br,
 					netdev_features_t features);
 void br_port_flags_change(struct net_bridge_port *port, unsigned long mask);
-- 
cgit v1.2.3


From 69ca9293e8dd9323c6cde579e1855d6ce9489a46 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Thu, 22 Mar 2018 10:09:53 -0700
Subject: tls: Generalize zerocopy_from_iter

Refactor zerocopy_from_iter to take arguments for pages and size,
such that it can be used for both tx and rx. RX will also support
zerocopy direct to output iter, as long as the full message can
be copied at once (a large enough userspace buffer was provided).

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 057a558ed6d7..ca1d20de3d2c 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -226,23 +226,24 @@ static int tls_sw_push_pending_record(struct sock *sk, int flags)
 }
 
 static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
-			      int length)
+			      int length, int *pages_used,
+			      unsigned int *size_used,
+			      struct scatterlist *to, int to_max_pages,
+			      bool charge)
 {
-	struct tls_context *tls_ctx = tls_get_ctx(sk);
-	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
 	struct page *pages[MAX_SKB_FRAGS];
 
 	size_t offset;
 	ssize_t copied, use;
 	int i = 0;
-	unsigned int size = ctx->sg_plaintext_size;
-	int num_elem = ctx->sg_plaintext_num_elem;
+	unsigned int size = *size_used;
+	int num_elem = *pages_used;
 	int rc = 0;
 	int maxpages;
 
 	while (length > 0) {
 		i = 0;
-		maxpages = ARRAY_SIZE(ctx->sg_plaintext_data) - num_elem;
+		maxpages = to_max_pages - num_elem;
 		if (maxpages == 0) {
 			rc = -EFAULT;
 			goto out;
@@ -262,10 +263,11 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
 		while (copied) {
 			use = min_t(int, copied, PAGE_SIZE - offset);
 
-			sg_set_page(&ctx->sg_plaintext_data[num_elem],
+			sg_set_page(&to[num_elem],
 				    pages[i], use, offset);
-			sg_unmark_end(&ctx->sg_plaintext_data[num_elem]);
-			sk_mem_charge(sk, use);
+			sg_unmark_end(&to[num_elem]);
+			if (charge)
+				sk_mem_charge(sk, use);
 
 			offset = 0;
 			copied -= use;
@@ -276,8 +278,9 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
 	}
 
 out:
-	ctx->sg_plaintext_size = size;
-	ctx->sg_plaintext_num_elem = num_elem;
+	*size_used = size;
+	*pages_used = num_elem;
+
 	return rc;
 }
 
@@ -374,7 +377,11 @@ alloc_encrypted:
 
 		if (full_record || eor) {
 			ret = zerocopy_from_iter(sk, &msg->msg_iter,
-						 try_to_copy);
+				try_to_copy, &ctx->sg_plaintext_num_elem,
+				&ctx->sg_plaintext_size,
+				ctx->sg_plaintext_data,
+				ARRAY_SIZE(ctx->sg_plaintext_data),
+				true);
 			if (ret)
 				goto fallback_to_reg_send;
 
-- 
cgit v1.2.3


From dbe425599ba05c7415f632e6f5f018453098eb69 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Thu, 22 Mar 2018 10:10:06 -0700
Subject: tls: Move cipher info to a separate struct

Separate tx crypto parameters to a separate cipher_context struct.
The same parameters will be used for rx using the same struct.

tls_advance_record_sn is modified to only take the cipher info.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_main.c |  8 ++++----
 net/tls/tls_sw.c   | 58 ++++++++++++++++++++++++++++--------------------------
 2 files changed, 34 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index d824d548447e..c671560b832b 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -259,8 +259,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 		}
 	}
 
-	kfree(ctx->rec_seq);
-	kfree(ctx->iv);
+	kfree(ctx->tx.rec_seq);
+	kfree(ctx->tx.iv);
 
 	if (ctx->tx_conf == TLS_SW_TX)
 		tls_sw_free_tx_resources(sk);
@@ -319,9 +319,9 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
 		}
 		lock_sock(sk);
 		memcpy(crypto_info_aes_gcm_128->iv,
-		       ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+		       ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
 		       TLS_CIPHER_AES_GCM_128_IV_SIZE);
-		memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->rec_seq,
+		memcpy(crypto_info_aes_gcm_128->rec_seq, ctx->tx.rec_seq,
 		       TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);
 		release_sock(sk);
 		if (copy_to_user(optval,
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index ca1d20de3d2c..338d743bcc21 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -79,7 +79,7 @@ static void trim_both_sgl(struct sock *sk, int target_size)
 		target_size);
 
 	if (target_size > 0)
-		target_size += tls_ctx->overhead_size;
+		target_size += tls_ctx->tx.overhead_size;
 
 	trim_sg(sk, ctx->sg_encrypted_data,
 		&ctx->sg_encrypted_num_elem,
@@ -152,21 +152,21 @@ static int tls_do_encryption(struct tls_context *tls_ctx,
 	if (!aead_req)
 		return -ENOMEM;
 
-	ctx->sg_encrypted_data[0].offset += tls_ctx->prepend_size;
-	ctx->sg_encrypted_data[0].length -= tls_ctx->prepend_size;
+	ctx->sg_encrypted_data[0].offset += tls_ctx->tx.prepend_size;
+	ctx->sg_encrypted_data[0].length -= tls_ctx->tx.prepend_size;
 
 	aead_request_set_tfm(aead_req, ctx->aead_send);
 	aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
 	aead_request_set_crypt(aead_req, ctx->sg_aead_in, ctx->sg_aead_out,
-			       data_len, tls_ctx->iv);
+			       data_len, tls_ctx->tx.iv);
 
 	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				  crypto_req_done, &ctx->async_wait);
 
 	rc = crypto_wait_req(crypto_aead_encrypt(aead_req), &ctx->async_wait);
 
-	ctx->sg_encrypted_data[0].offset -= tls_ctx->prepend_size;
-	ctx->sg_encrypted_data[0].length += tls_ctx->prepend_size;
+	ctx->sg_encrypted_data[0].offset -= tls_ctx->tx.prepend_size;
+	ctx->sg_encrypted_data[0].length += tls_ctx->tx.prepend_size;
 
 	kfree(aead_req);
 	return rc;
@@ -183,7 +183,7 @@ static int tls_push_record(struct sock *sk, int flags,
 	sg_mark_end(ctx->sg_encrypted_data + ctx->sg_encrypted_num_elem - 1);
 
 	tls_make_aad(ctx->aad_space, ctx->sg_plaintext_size,
-		     tls_ctx->rec_seq, tls_ctx->rec_seq_size,
+		     tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size,
 		     record_type);
 
 	tls_fill_prepend(tls_ctx,
@@ -216,7 +216,7 @@ static int tls_push_record(struct sock *sk, int flags,
 	if (rc < 0 && rc != -EAGAIN)
 		tls_err_abort(sk);
 
-	tls_advance_record_sn(sk, tls_ctx);
+	tls_advance_record_sn(sk, &tls_ctx->tx);
 	return rc;
 }
 
@@ -357,7 +357,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 		}
 
 		required_size = ctx->sg_plaintext_size + try_to_copy +
-				tls_ctx->overhead_size;
+				tls_ctx->tx.overhead_size;
 
 		if (!sk_stream_memory_free(sk))
 			goto wait_for_sndbuf;
@@ -420,7 +420,7 @@ alloc_plaintext:
 				&ctx->sg_encrypted_num_elem,
 				&ctx->sg_encrypted_size,
 				ctx->sg_plaintext_size +
-				tls_ctx->overhead_size);
+				tls_ctx->tx.overhead_size);
 		}
 
 		ret = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy);
@@ -512,7 +512,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 			full_record = true;
 		}
 		required_size = ctx->sg_plaintext_size + copy +
-			      tls_ctx->overhead_size;
+			      tls_ctx->tx.overhead_size;
 
 		if (!sk_stream_memory_free(sk))
 			goto wait_for_sndbuf;
@@ -644,24 +644,26 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
 		goto free_priv;
 	}
 
-	ctx->prepend_size = TLS_HEADER_SIZE + nonce_size;
-	ctx->tag_size = tag_size;
-	ctx->overhead_size = ctx->prepend_size + ctx->tag_size;
-	ctx->iv_size = iv_size;
-	ctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, GFP_KERNEL);
-	if (!ctx->iv) {
+	ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size;
+	ctx->tx.tag_size = tag_size;
+	ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size;
+	ctx->tx.iv_size = iv_size;
+	ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+			     GFP_KERNEL);
+	if (!ctx->tx.iv) {
 		rc = -ENOMEM;
 		goto free_priv;
 	}
-	memcpy(ctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
-	memcpy(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
-	ctx->rec_seq_size = rec_seq_size;
-	ctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
-	if (!ctx->rec_seq) {
+	memcpy(ctx->tx.iv, gcm_128_info->salt,
+	       TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+	memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
+	ctx->tx.rec_seq_size = rec_seq_size;
+	ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
+	if (!ctx->tx.rec_seq) {
 		rc = -ENOMEM;
 		goto free_iv;
 	}
-	memcpy(ctx->rec_seq, rec_seq, rec_seq_size);
+	memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size);
 
 	sg_init_table(sw_ctx->sg_encrypted_data,
 		      ARRAY_SIZE(sw_ctx->sg_encrypted_data));
@@ -697,7 +699,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
 	if (rc)
 		goto free_aead;
 
-	rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tag_size);
+	rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tx.tag_size);
 	if (!rc)
 		return 0;
 
@@ -705,11 +707,11 @@ free_aead:
 	crypto_free_aead(sw_ctx->aead_send);
 	sw_ctx->aead_send = NULL;
 free_rec_seq:
-	kfree(ctx->rec_seq);
-	ctx->rec_seq = NULL;
+	kfree(ctx->tx.rec_seq);
+	ctx->tx.rec_seq = NULL;
 free_iv:
-	kfree(ctx->iv);
-	ctx->iv = NULL;
+	kfree(ctx->tx.iv);
+	ctx->tx.iv = NULL;
 free_priv:
 	kfree(ctx->priv_ctx);
 	ctx->priv_ctx = NULL;
-- 
cgit v1.2.3


From f4a8e43f1f0abc0e93ed5ee132288ee4142afde1 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Thu, 22 Mar 2018 10:10:15 -0700
Subject: tls: Pass error code explicitly to tls_err_abort

Pass EBADMSG explicitly to tls_err_abort.  Receive path will
pass additional codes - EMSGSIZE if framing is larger than max
TLS record size, EINVAL if TLS version mismatch.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 338d743bcc21..1c79d9ad1731 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -214,7 +214,7 @@ static int tls_push_record(struct sock *sk, int flags,
 	/* Only pass through MSG_DONTWAIT and MSG_NOSIGNAL flags */
 	rc = tls_push_sg(sk, tls_ctx, ctx->sg_encrypted_data, 0, flags);
 	if (rc < 0 && rc != -EAGAIN)
-		tls_err_abort(sk);
+		tls_err_abort(sk, EBADMSG);
 
 	tls_advance_record_sn(sk, &tls_ctx->tx);
 	return rc;
-- 
cgit v1.2.3


From 583715853a25b4f2720b847e4fb8e37727299152 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Thu, 22 Mar 2018 10:10:26 -0700
Subject: tls: Refactor variable names

Several config variables are prefixed with tx, drop the prefix
since these will be used for both tx and rx.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_main.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'net')

diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index c671560b832b..c405beeda765 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -52,7 +52,7 @@ enum {
 };
 
 enum {
-	TLS_BASE_TX,
+	TLS_BASE,
 	TLS_SW_TX,
 	TLS_NUM_CONFIG,
 };
@@ -65,7 +65,7 @@ static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx)
 {
 	int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
 
-	sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf];
+	sk->sk_prot = &tls_prots[ip_ver][ctx->conf];
 }
 
 int wait_on_pending_writer(struct sock *sk, long *timeo)
@@ -238,7 +238,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 	lock_sock(sk);
 	sk_proto_close = ctx->sk_proto_close;
 
-	if (ctx->tx_conf == TLS_BASE_TX) {
+	if (ctx->conf == TLS_BASE) {
 		kfree(ctx);
 		goto skip_tx_cleanup;
 	}
@@ -262,7 +262,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 	kfree(ctx->tx.rec_seq);
 	kfree(ctx->tx.iv);
 
-	if (ctx->tx_conf == TLS_SW_TX)
+	if (ctx->conf == TLS_SW_TX)
 		tls_sw_free_tx_resources(sk);
 
 skip_tx_cleanup:
@@ -371,7 +371,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
 	struct tls_crypto_info *crypto_info;
 	struct tls_context *ctx = tls_get_ctx(sk);
 	int rc = 0;
-	int tx_conf;
+	int conf;
 
 	if (!optval || (optlen < sizeof(*crypto_info))) {
 		rc = -EINVAL;
@@ -418,11 +418,11 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
 
 	/* currently SW is default, we will have ethtool in future */
 	rc = tls_set_sw_offload(sk, ctx);
-	tx_conf = TLS_SW_TX;
+	conf = TLS_SW_TX;
 	if (rc)
 		goto err_crypto_info;
 
-	ctx->tx_conf = tx_conf;
+	ctx->conf = conf;
 	update_sk_prot(sk, ctx);
 	ctx->sk_write_space = sk->sk_write_space;
 	sk->sk_write_space = tls_write_space;
@@ -465,12 +465,12 @@ static int tls_setsockopt(struct sock *sk, int level, int optname,
 
 static void build_protos(struct proto *prot, struct proto *base)
 {
-	prot[TLS_BASE_TX] = *base;
-	prot[TLS_BASE_TX].setsockopt	= tls_setsockopt;
-	prot[TLS_BASE_TX].getsockopt	= tls_getsockopt;
-	prot[TLS_BASE_TX].close		= tls_sk_proto_close;
+	prot[TLS_BASE] = *base;
+	prot[TLS_BASE].setsockopt	= tls_setsockopt;
+	prot[TLS_BASE].getsockopt	= tls_getsockopt;
+	prot[TLS_BASE].close		= tls_sk_proto_close;
 
-	prot[TLS_SW_TX] = prot[TLS_BASE_TX];
+	prot[TLS_SW_TX] = prot[TLS_BASE];
 	prot[TLS_SW_TX].sendmsg		= tls_sw_sendmsg;
 	prot[TLS_SW_TX].sendpage	= tls_sw_sendpage;
 }
@@ -513,7 +513,7 @@ static int tls_init(struct sock *sk)
 		mutex_unlock(&tcpv6_prot_mutex);
 	}
 
-	ctx->tx_conf = TLS_BASE_TX;
+	ctx->conf = TLS_BASE;
 	update_sk_prot(sk, ctx);
 out:
 	return rc;
-- 
cgit v1.2.3


From c46234ebb4d1eee5e09819f49169e51cfc6eb909 Mon Sep 17 00:00:00 2001
From: Dave Watson <davejwatson@fb.com>
Date: Thu, 22 Mar 2018 10:10:35 -0700
Subject: tls: RX path for ktls

Add rx path for tls software implementation.

recvmsg, splice_read, and poll implemented.

An additional sockopt TLS_RX is added, with the same interface as
TLS_TX.  Either TLX_RX or TLX_TX may be provided separately, or
together (with two different setsockopt calls with appropriate keys).

Control messages are passed via CMSG in a similar way to transmit.
If no cmsg buffer is passed, then only application data records
will be passed to userspace, and EIO is returned for other types of
alerts.

EBADMSG is passed for decryption errors, and EMSGSIZE is passed for
framing too big, and EBADMSG for framing too small (matching openssl
semantics). EINVAL is returned for TLS versions that do not match the
original setsockopt call.  All are unrecoverable.

strparser is used to parse TLS framing.   Decryption is done directly
in to userspace buffers if they are large enough to support it, otherwise
sk_cow_data is called (similar to ipsec), and buffers are decrypted in
place and copied.  splice_read always decrypts in place, since no
buffers are provided to decrypt in to.

sk_poll is overridden, and only returns POLLIN if a full TLS message is
received.  Otherwise we wait for strparser to finish reading a full frame.
Actual decryption is only done during recvmsg or splice_read calls.

Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/Kconfig    |   1 +
 net/tls/tls_main.c |  62 +++++-
 net/tls/tls_sw.c   | 587 +++++++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 582 insertions(+), 68 deletions(-)

(limited to 'net')

diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index eb583038c67e..89b8745a986f 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -7,6 +7,7 @@ config TLS
 	select CRYPTO
 	select CRYPTO_AES
 	select CRYPTO_GCM
+	select STREAM_PARSER
 	default n
 	---help---
 	Enable kernel support for TLS protocol. This allows symmetric
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index c405beeda765..6f5c1146da4a 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -54,12 +54,15 @@ enum {
 enum {
 	TLS_BASE,
 	TLS_SW_TX,
+	TLS_SW_RX,
+	TLS_SW_RXTX,
 	TLS_NUM_CONFIG,
 };
 
 static struct proto *saved_tcpv6_prot;
 static DEFINE_MUTEX(tcpv6_prot_mutex);
 static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG];
+static struct proto_ops tls_sw_proto_ops;
 
 static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx)
 {
@@ -261,9 +264,14 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 
 	kfree(ctx->tx.rec_seq);
 	kfree(ctx->tx.iv);
+	kfree(ctx->rx.rec_seq);
+	kfree(ctx->rx.iv);
 
-	if (ctx->conf == TLS_SW_TX)
-		tls_sw_free_tx_resources(sk);
+	if (ctx->conf == TLS_SW_TX ||
+	    ctx->conf == TLS_SW_RX ||
+	    ctx->conf == TLS_SW_RXTX) {
+		tls_sw_free_resources(sk);
+	}
 
 skip_tx_cleanup:
 	release_sock(sk);
@@ -365,8 +373,8 @@ static int tls_getsockopt(struct sock *sk, int level, int optname,
 	return do_tls_getsockopt(sk, optname, optval, optlen);
 }
 
-static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
-				unsigned int optlen)
+static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
+				  unsigned int optlen, int tx)
 {
 	struct tls_crypto_info *crypto_info;
 	struct tls_context *ctx = tls_get_ctx(sk);
@@ -378,7 +386,11 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
 		goto out;
 	}
 
-	crypto_info = &ctx->crypto_send;
+	if (tx)
+		crypto_info = &ctx->crypto_send;
+	else
+		crypto_info = &ctx->crypto_recv;
+
 	/* Currently we don't support set crypto info more than one time */
 	if (TLS_CRYPTO_INFO_READY(crypto_info)) {
 		rc = -EBUSY;
@@ -417,15 +429,31 @@ static int do_tls_setsockopt_tx(struct sock *sk, char __user *optval,
 	}
 
 	/* currently SW is default, we will have ethtool in future */
-	rc = tls_set_sw_offload(sk, ctx);
-	conf = TLS_SW_TX;
+	if (tx) {
+		rc = tls_set_sw_offload(sk, ctx, 1);
+		if (ctx->conf == TLS_SW_RX)
+			conf = TLS_SW_RXTX;
+		else
+			conf = TLS_SW_TX;
+	} else {
+		rc = tls_set_sw_offload(sk, ctx, 0);
+		if (ctx->conf == TLS_SW_TX)
+			conf = TLS_SW_RXTX;
+		else
+			conf = TLS_SW_RX;
+	}
+
 	if (rc)
 		goto err_crypto_info;
 
 	ctx->conf = conf;
 	update_sk_prot(sk, ctx);
-	ctx->sk_write_space = sk->sk_write_space;
-	sk->sk_write_space = tls_write_space;
+	if (tx) {
+		ctx->sk_write_space = sk->sk_write_space;
+		sk->sk_write_space = tls_write_space;
+	} else {
+		sk->sk_socket->ops = &tls_sw_proto_ops;
+	}
 	goto out;
 
 err_crypto_info:
@@ -441,8 +469,10 @@ static int do_tls_setsockopt(struct sock *sk, int optname,
 
 	switch (optname) {
 	case TLS_TX:
+	case TLS_RX:
 		lock_sock(sk);
-		rc = do_tls_setsockopt_tx(sk, optval, optlen);
+		rc = do_tls_setsockopt_conf(sk, optval, optlen,
+					    optname == TLS_TX);
 		release_sock(sk);
 		break;
 	default:
@@ -473,6 +503,14 @@ static void build_protos(struct proto *prot, struct proto *base)
 	prot[TLS_SW_TX] = prot[TLS_BASE];
 	prot[TLS_SW_TX].sendmsg		= tls_sw_sendmsg;
 	prot[TLS_SW_TX].sendpage	= tls_sw_sendpage;
+
+	prot[TLS_SW_RX] = prot[TLS_BASE];
+	prot[TLS_SW_RX].recvmsg		= tls_sw_recvmsg;
+	prot[TLS_SW_RX].close		= tls_sk_proto_close;
+
+	prot[TLS_SW_RXTX] = prot[TLS_SW_TX];
+	prot[TLS_SW_RXTX].recvmsg	= tls_sw_recvmsg;
+	prot[TLS_SW_RXTX].close		= tls_sk_proto_close;
 }
 
 static int tls_init(struct sock *sk)
@@ -531,6 +569,10 @@ static int __init tls_register(void)
 {
 	build_protos(tls_prots[TLSV4], &tcp_prot);
 
+	tls_sw_proto_ops = inet_stream_ops;
+	tls_sw_proto_ops.poll = tls_sw_poll;
+	tls_sw_proto_ops.splice_read = tls_sw_splice_read;
+
 	tcp_register_ulp(&tcp_tls_ulp_ops);
 
 	return 0;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 1c79d9ad1731..4dc766b03f00 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -34,11 +34,60 @@
  * SOFTWARE.
  */
 
+#include <linux/sched/signal.h>
 #include <linux/module.h>
 #include <crypto/aead.h>
 
+#include <net/strparser.h>
 #include <net/tls.h>
 
+static int tls_do_decryption(struct sock *sk,
+			     struct scatterlist *sgin,
+			     struct scatterlist *sgout,
+			     char *iv_recv,
+			     size_t data_len,
+			     struct sk_buff *skb,
+			     gfp_t flags)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct strp_msg *rxm = strp_msg(skb);
+	struct aead_request *aead_req;
+
+	int ret;
+	unsigned int req_size = sizeof(struct aead_request) +
+		crypto_aead_reqsize(ctx->aead_recv);
+
+	aead_req = kzalloc(req_size, flags);
+	if (!aead_req)
+		return -ENOMEM;
+
+	aead_request_set_tfm(aead_req, ctx->aead_recv);
+	aead_request_set_ad(aead_req, TLS_AAD_SPACE_SIZE);
+	aead_request_set_crypt(aead_req, sgin, sgout,
+			       data_len + tls_ctx->rx.tag_size,
+			       (u8 *)iv_recv);
+	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				  crypto_req_done, &ctx->async_wait);
+
+	ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait);
+
+	if (ret < 0)
+		goto out;
+
+	rxm->offset += tls_ctx->rx.prepend_size;
+	rxm->full_len -= tls_ctx->rx.overhead_size;
+	tls_advance_record_sn(sk, &tls_ctx->rx);
+
+	ctx->decrypted = true;
+
+	ctx->saved_data_ready(sk);
+
+out:
+	kfree(aead_req);
+	return ret;
+}
+
 static void trim_sg(struct sock *sk, struct scatterlist *sg,
 		    int *sg_num_elem, unsigned int *sg_size, int target_size)
 {
@@ -581,13 +630,404 @@ sendpage_end:
 	return ret;
 }
 
-void tls_sw_free_tx_resources(struct sock *sk)
+static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
+				     long timeo, int *err)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct sk_buff *skb;
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	while (!(skb = ctx->recv_pkt)) {
+		if (sk->sk_err) {
+			*err = sock_error(sk);
+			return NULL;
+		}
+
+		if (sock_flag(sk, SOCK_DONE))
+			return NULL;
+
+		if ((flags & MSG_DONTWAIT) || !timeo) {
+			*err = -EAGAIN;
+			return NULL;
+		}
+
+		add_wait_queue(sk_sleep(sk), &wait);
+		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+		sk_wait_event(sk, &timeo, ctx->recv_pkt != skb, &wait);
+		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+		remove_wait_queue(sk_sleep(sk), &wait);
+
+		/* Handle signals */
+		if (signal_pending(current)) {
+			*err = sock_intr_errno(timeo);
+			return NULL;
+		}
+	}
+
+	return skb;
+}
+
+static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
+		       struct scatterlist *sgout)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	char iv[TLS_CIPHER_AES_GCM_128_SALT_SIZE + tls_ctx->rx.iv_size];
+	struct scatterlist sgin_arr[MAX_SKB_FRAGS + 2];
+	struct scatterlist *sgin = &sgin_arr[0];
+	struct strp_msg *rxm = strp_msg(skb);
+	int ret, nsg = ARRAY_SIZE(sgin_arr);
+	char aad_recv[TLS_AAD_SPACE_SIZE];
+	struct sk_buff *unused;
+
+	ret = skb_copy_bits(skb, rxm->offset + TLS_HEADER_SIZE,
+			    iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+			    tls_ctx->rx.iv_size);
+	if (ret < 0)
+		return ret;
+
+	memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+	if (!sgout) {
+		nsg = skb_cow_data(skb, 0, &unused) + 1;
+		sgin = kmalloc_array(nsg, sizeof(*sgin), sk->sk_allocation);
+		if (!sgout)
+			sgout = sgin;
+	}
+
+	sg_init_table(sgin, nsg);
+	sg_set_buf(&sgin[0], aad_recv, sizeof(aad_recv));
+
+	nsg = skb_to_sgvec(skb, &sgin[1],
+			   rxm->offset + tls_ctx->rx.prepend_size,
+			   rxm->full_len - tls_ctx->rx.prepend_size);
+
+	tls_make_aad(aad_recv,
+		     rxm->full_len - tls_ctx->rx.overhead_size,
+		     tls_ctx->rx.rec_seq,
+		     tls_ctx->rx.rec_seq_size,
+		     ctx->control);
+
+	ret = tls_do_decryption(sk, sgin, sgout, iv,
+				rxm->full_len - tls_ctx->rx.overhead_size,
+				skb, sk->sk_allocation);
+
+	if (sgin != &sgin_arr[0])
+		kfree(sgin);
+
+	return ret;
+}
+
+static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
+			       unsigned int len)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct strp_msg *rxm = strp_msg(skb);
+
+	if (len < rxm->full_len) {
+		rxm->offset += len;
+		rxm->full_len -= len;
+
+		return false;
+	}
+
+	/* Finished with message */
+	ctx->recv_pkt = NULL;
+	kfree_skb(skb);
+	strp_unpause(&ctx->strp);
+
+	return true;
+}
+
+int tls_sw_recvmsg(struct sock *sk,
+		   struct msghdr *msg,
+		   size_t len,
+		   int nonblock,
+		   int flags,
+		   int *addr_len)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	unsigned char control;
+	struct strp_msg *rxm;
+	struct sk_buff *skb;
+	ssize_t copied = 0;
+	bool cmsg = false;
+	int err = 0;
+	long timeo;
+
+	flags |= nonblock;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+	do {
+		bool zc = false;
+		int chunk = 0;
+
+		skb = tls_wait_data(sk, flags, timeo, &err);
+		if (!skb)
+			goto recv_end;
+
+		rxm = strp_msg(skb);
+		if (!cmsg) {
+			int cerr;
+
+			cerr = put_cmsg(msg, SOL_TLS, TLS_GET_RECORD_TYPE,
+					sizeof(ctx->control), &ctx->control);
+			cmsg = true;
+			control = ctx->control;
+			if (ctx->control != TLS_RECORD_TYPE_DATA) {
+				if (cerr || msg->msg_flags & MSG_CTRUNC) {
+					err = -EIO;
+					goto recv_end;
+				}
+			}
+		} else if (control != ctx->control) {
+			goto recv_end;
+		}
+
+		if (!ctx->decrypted) {
+			int page_count;
+			int to_copy;
+
+			page_count = iov_iter_npages(&msg->msg_iter,
+						     MAX_SKB_FRAGS);
+			to_copy = rxm->full_len - tls_ctx->rx.overhead_size;
+			if (to_copy <= len && page_count < MAX_SKB_FRAGS &&
+			    likely(!(flags & MSG_PEEK)))  {
+				struct scatterlist sgin[MAX_SKB_FRAGS + 1];
+				char unused[21];
+				int pages = 0;
+
+				zc = true;
+				sg_init_table(sgin, MAX_SKB_FRAGS + 1);
+				sg_set_buf(&sgin[0], unused, 13);
+
+				err = zerocopy_from_iter(sk, &msg->msg_iter,
+							 to_copy, &pages,
+							 &chunk, &sgin[1],
+							 MAX_SKB_FRAGS,	false);
+				if (err < 0)
+					goto fallback_to_reg_recv;
+
+				err = decrypt_skb(sk, skb, sgin);
+				for (; pages > 0; pages--)
+					put_page(sg_page(&sgin[pages]));
+				if (err < 0) {
+					tls_err_abort(sk, EBADMSG);
+					goto recv_end;
+				}
+			} else {
+fallback_to_reg_recv:
+				err = decrypt_skb(sk, skb, NULL);
+				if (err < 0) {
+					tls_err_abort(sk, EBADMSG);
+					goto recv_end;
+				}
+			}
+			ctx->decrypted = true;
+		}
+
+		if (!zc) {
+			chunk = min_t(unsigned int, rxm->full_len, len);
+			err = skb_copy_datagram_msg(skb, rxm->offset, msg,
+						    chunk);
+			if (err < 0)
+				goto recv_end;
+		}
+
+		copied += chunk;
+		len -= chunk;
+		if (likely(!(flags & MSG_PEEK))) {
+			u8 control = ctx->control;
+
+			if (tls_sw_advance_skb(sk, skb, chunk)) {
+				/* Return full control message to
+				 * userspace before trying to parse
+				 * another message type
+				 */
+				msg->msg_flags |= MSG_EOR;
+				if (control != TLS_RECORD_TYPE_DATA)
+					goto recv_end;
+			}
+		}
+	} while (len);
+
+recv_end:
+	release_sock(sk);
+	return copied ? : err;
+}
+
+ssize_t tls_sw_splice_read(struct socket *sock,  loff_t *ppos,
+			   struct pipe_inode_info *pipe,
+			   size_t len, unsigned int flags)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sock->sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct strp_msg *rxm = NULL;
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb;
+	ssize_t copied = 0;
+	int err = 0;
+	long timeo;
+	int chunk;
+
+	lock_sock(sk);
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	skb = tls_wait_data(sk, flags, timeo, &err);
+	if (!skb)
+		goto splice_read_end;
+
+	/* splice does not support reading control messages */
+	if (ctx->control != TLS_RECORD_TYPE_DATA) {
+		err = -ENOTSUPP;
+		goto splice_read_end;
+	}
+
+	if (!ctx->decrypted) {
+		err = decrypt_skb(sk, skb, NULL);
+
+		if (err < 0) {
+			tls_err_abort(sk, EBADMSG);
+			goto splice_read_end;
+		}
+		ctx->decrypted = true;
+	}
+	rxm = strp_msg(skb);
+
+	chunk = min_t(unsigned int, rxm->full_len, len);
+	copied = skb_splice_bits(skb, sk, rxm->offset, pipe, chunk, flags);
+	if (copied < 0)
+		goto splice_read_end;
+
+	if (likely(!(flags & MSG_PEEK)))
+		tls_sw_advance_skb(sk, skb, copied);
+
+splice_read_end:
+	release_sock(sk);
+	return copied ? : err;
+}
+
+unsigned int tls_sw_poll(struct file *file, struct socket *sock,
+			 struct poll_table_struct *wait)
+{
+	unsigned int ret;
+	struct sock *sk = sock->sk;
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+
+	/* Grab POLLOUT and POLLHUP from the underlying socket */
+	ret = ctx->sk_poll(file, sock, wait);
+
+	/* Clear POLLIN bits, and set based on recv_pkt */
+	ret &= ~(POLLIN | POLLRDNORM);
+	if (ctx->recv_pkt)
+		ret |= POLLIN | POLLRDNORM;
+
+	return ret;
+}
+
+static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	char header[tls_ctx->rx.prepend_size];
+	struct strp_msg *rxm = strp_msg(skb);
+	size_t cipher_overhead;
+	size_t data_len = 0;
+	int ret;
+
+	/* Verify that we have a full TLS header, or wait for more data */
+	if (rxm->offset + tls_ctx->rx.prepend_size > skb->len)
+		return 0;
+
+	/* Linearize header to local buffer */
+	ret = skb_copy_bits(skb, rxm->offset, header, tls_ctx->rx.prepend_size);
+
+	if (ret < 0)
+		goto read_failure;
+
+	ctx->control = header[0];
+
+	data_len = ((header[4] & 0xFF) | (header[3] << 8));
+
+	cipher_overhead = tls_ctx->rx.tag_size + tls_ctx->rx.iv_size;
+
+	if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead) {
+		ret = -EMSGSIZE;
+		goto read_failure;
+	}
+	if (data_len < cipher_overhead) {
+		ret = -EBADMSG;
+		goto read_failure;
+	}
+
+	if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.version) ||
+	    header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.version)) {
+		ret = -EINVAL;
+		goto read_failure;
+	}
+
+	return data_len + TLS_HEADER_SIZE;
+
+read_failure:
+	tls_err_abort(strp->sk, ret);
+
+	return ret;
+}
+
+static void tls_queue(struct strparser *strp, struct sk_buff *skb)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+	struct strp_msg *rxm;
+
+	rxm = strp_msg(skb);
+
+	ctx->decrypted = false;
+
+	ctx->recv_pkt = skb;
+	strp_pause(strp);
+
+	strp->sk->sk_state_change(strp->sk);
+}
+
+static void tls_data_ready(struct sock *sk)
+{
+	struct tls_context *tls_ctx = tls_get_ctx(sk);
+	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
+
+	strp_data_ready(&ctx->strp);
+}
+
+void tls_sw_free_resources(struct sock *sk)
 {
 	struct tls_context *tls_ctx = tls_get_ctx(sk);
 	struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx);
 
 	if (ctx->aead_send)
 		crypto_free_aead(ctx->aead_send);
+	if (ctx->aead_recv) {
+		if (ctx->recv_pkt) {
+			kfree_skb(ctx->recv_pkt);
+			ctx->recv_pkt = NULL;
+		}
+		crypto_free_aead(ctx->aead_recv);
+		strp_stop(&ctx->strp);
+		write_lock_bh(&sk->sk_callback_lock);
+		sk->sk_data_ready = ctx->saved_data_ready;
+		write_unlock_bh(&sk->sk_callback_lock);
+		release_sock(sk);
+		strp_done(&ctx->strp);
+		lock_sock(sk);
+	}
 
 	tls_free_both_sg(sk);
 
@@ -595,12 +1035,15 @@ void tls_sw_free_tx_resources(struct sock *sk)
 	kfree(tls_ctx);
 }
 
-int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
+int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
 {
 	char keyval[TLS_CIPHER_AES_GCM_128_KEY_SIZE];
 	struct tls_crypto_info *crypto_info;
 	struct tls12_crypto_info_aes_gcm_128 *gcm_128_info;
 	struct tls_sw_context *sw_ctx;
+	struct cipher_context *cctx;
+	struct crypto_aead **aead;
+	struct strp_callbacks cb;
 	u16 nonce_size, tag_size, iv_size, rec_seq_size;
 	char *iv, *rec_seq;
 	int rc = 0;
@@ -610,22 +1053,29 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
 		goto out;
 	}
 
-	if (ctx->priv_ctx) {
-		rc = -EEXIST;
-		goto out;
-	}
-
-	sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL);
-	if (!sw_ctx) {
-		rc = -ENOMEM;
-		goto out;
+	if (!ctx->priv_ctx) {
+		sw_ctx = kzalloc(sizeof(*sw_ctx), GFP_KERNEL);
+		if (!sw_ctx) {
+			rc = -ENOMEM;
+			goto out;
+		}
+		crypto_init_wait(&sw_ctx->async_wait);
+	} else {
+		sw_ctx = ctx->priv_ctx;
 	}
 
-	crypto_init_wait(&sw_ctx->async_wait);
-
 	ctx->priv_ctx = (struct tls_offload_context *)sw_ctx;
 
-	crypto_info = &ctx->crypto_send;
+	if (tx) {
+		crypto_info = &ctx->crypto_send;
+		cctx = &ctx->tx;
+		aead = &sw_ctx->aead_send;
+	} else {
+		crypto_info = &ctx->crypto_recv;
+		cctx = &ctx->rx;
+		aead = &sw_ctx->aead_recv;
+	}
+
 	switch (crypto_info->cipher_type) {
 	case TLS_CIPHER_AES_GCM_128: {
 		nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE;
@@ -644,48 +1094,49 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
 		goto free_priv;
 	}
 
-	ctx->tx.prepend_size = TLS_HEADER_SIZE + nonce_size;
-	ctx->tx.tag_size = tag_size;
-	ctx->tx.overhead_size = ctx->tx.prepend_size + ctx->tx.tag_size;
-	ctx->tx.iv_size = iv_size;
-	ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
-			     GFP_KERNEL);
-	if (!ctx->tx.iv) {
+	cctx->prepend_size = TLS_HEADER_SIZE + nonce_size;
+	cctx->tag_size = tag_size;
+	cctx->overhead_size = cctx->prepend_size + cctx->tag_size;
+	cctx->iv_size = iv_size;
+	cctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE,
+			   GFP_KERNEL);
+	if (!cctx->iv) {
 		rc = -ENOMEM;
 		goto free_priv;
 	}
-	memcpy(ctx->tx.iv, gcm_128_info->salt,
-	       TLS_CIPHER_AES_GCM_128_SALT_SIZE);
-	memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
-	ctx->tx.rec_seq_size = rec_seq_size;
-	ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
-	if (!ctx->tx.rec_seq) {
+	memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
+	memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
+	cctx->rec_seq_size = rec_seq_size;
+	cctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
+	if (!cctx->rec_seq) {
 		rc = -ENOMEM;
 		goto free_iv;
 	}
-	memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size);
-
-	sg_init_table(sw_ctx->sg_encrypted_data,
-		      ARRAY_SIZE(sw_ctx->sg_encrypted_data));
-	sg_init_table(sw_ctx->sg_plaintext_data,
-		      ARRAY_SIZE(sw_ctx->sg_plaintext_data));
-
-	sg_init_table(sw_ctx->sg_aead_in, 2);
-	sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space,
-		   sizeof(sw_ctx->aad_space));
-	sg_unmark_end(&sw_ctx->sg_aead_in[1]);
-	sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data);
-	sg_init_table(sw_ctx->sg_aead_out, 2);
-	sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space,
-		   sizeof(sw_ctx->aad_space));
-	sg_unmark_end(&sw_ctx->sg_aead_out[1]);
-	sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data);
-
-	if (!sw_ctx->aead_send) {
-		sw_ctx->aead_send = crypto_alloc_aead("gcm(aes)", 0, 0);
-		if (IS_ERR(sw_ctx->aead_send)) {
-			rc = PTR_ERR(sw_ctx->aead_send);
-			sw_ctx->aead_send = NULL;
+	memcpy(cctx->rec_seq, rec_seq, rec_seq_size);
+
+	if (tx) {
+		sg_init_table(sw_ctx->sg_encrypted_data,
+			      ARRAY_SIZE(sw_ctx->sg_encrypted_data));
+		sg_init_table(sw_ctx->sg_plaintext_data,
+			      ARRAY_SIZE(sw_ctx->sg_plaintext_data));
+
+		sg_init_table(sw_ctx->sg_aead_in, 2);
+		sg_set_buf(&sw_ctx->sg_aead_in[0], sw_ctx->aad_space,
+			   sizeof(sw_ctx->aad_space));
+		sg_unmark_end(&sw_ctx->sg_aead_in[1]);
+		sg_chain(sw_ctx->sg_aead_in, 2, sw_ctx->sg_plaintext_data);
+		sg_init_table(sw_ctx->sg_aead_out, 2);
+		sg_set_buf(&sw_ctx->sg_aead_out[0], sw_ctx->aad_space,
+			   sizeof(sw_ctx->aad_space));
+		sg_unmark_end(&sw_ctx->sg_aead_out[1]);
+		sg_chain(sw_ctx->sg_aead_out, 2, sw_ctx->sg_encrypted_data);
+	}
+
+	if (!*aead) {
+		*aead = crypto_alloc_aead("gcm(aes)", 0, 0);
+		if (IS_ERR(*aead)) {
+			rc = PTR_ERR(*aead);
+			*aead = NULL;
 			goto free_rec_seq;
 		}
 	}
@@ -694,21 +1145,41 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx)
 
 	memcpy(keyval, gcm_128_info->key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
 
-	rc = crypto_aead_setkey(sw_ctx->aead_send, keyval,
+	rc = crypto_aead_setkey(*aead, keyval,
 				TLS_CIPHER_AES_GCM_128_KEY_SIZE);
 	if (rc)
 		goto free_aead;
 
-	rc = crypto_aead_setauthsize(sw_ctx->aead_send, ctx->tx.tag_size);
-	if (!rc)
-		return 0;
+	rc = crypto_aead_setauthsize(*aead, cctx->tag_size);
+	if (rc)
+		goto free_aead;
+
+	if (!tx) {
+		/* Set up strparser */
+		memset(&cb, 0, sizeof(cb));
+		cb.rcv_msg = tls_queue;
+		cb.parse_msg = tls_read_size;
+
+		strp_init(&sw_ctx->strp, sk, &cb);
+
+		write_lock_bh(&sk->sk_callback_lock);
+		sw_ctx->saved_data_ready = sk->sk_data_ready;
+		sk->sk_data_ready = tls_data_ready;
+		write_unlock_bh(&sk->sk_callback_lock);
+
+		sw_ctx->sk_poll = sk->sk_socket->ops->poll;
+
+		strp_check_rcv(&sw_ctx->strp);
+	}
+
+	goto out;
 
 free_aead:
-	crypto_free_aead(sw_ctx->aead_send);
-	sw_ctx->aead_send = NULL;
+	crypto_free_aead(*aead);
+	*aead = NULL;
 free_rec_seq:
-	kfree(ctx->tx.rec_seq);
-	ctx->tx.rec_seq = NULL;
+	kfree(cctx->rec_seq);
+	cctx->rec_seq = NULL;
 free_iv:
 	kfree(ctx->tx.iv);
 	ctx->tx.iv = NULL;
-- 
cgit v1.2.3


From 82792a070b168a9659c78e6a19efe5b6d1a7ce73 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 23 Mar 2018 18:27:06 +0200
Subject: net: bridge: fix direct access to bridge vlan_enabled and use helper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We need to use br_vlan_enabled() helper otherwise we'll break builds
without bridge vlans:
net/bridge//br_if.c: In function ‘br_mtu’:
net/bridge//br_if.c:458:8: error: ‘const struct net_bridge’ has no
member named ‘vlan_enabled’
  if (br->vlan_enabled)
        ^
net/bridge//br_if.c:462:1: warning: control reaches end of non-void
function [-Wreturn-type]
 }
 ^
scripts/Makefile.build:324: recipe for target 'net/bridge//br_if.o'
failed

Fixes: 419d14af9e07 ("bridge: Allow max MTU when multiple VLANs present")
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_if.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 48dc4d2e2be3..87b2afd455c7 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -455,7 +455,7 @@ static int __br_mtu(const struct net_bridge *br, bool (compare_fn)(int, int))
 
 int br_mtu(const struct net_bridge *br)
 {
-	if (br->vlan_enabled)
+	if (br_vlan_enabled(br->dev))
 		return __br_mtu(br, max_mtu);
 	else
 		return __br_mtu(br, min_mtu);
-- 
cgit v1.2.3


From fc18999ed2a217f249e9880363b28a89c839737e Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 22 Mar 2018 21:34:46 +0300
Subject: net: Convert udp_sysctl_ops

These pernet_operations just initialize udp4 defaults.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/udp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 908fc02fb4f8..c6dc019bc64b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2842,7 +2842,8 @@ static int __net_init udp_sysctl_init(struct net *net)
 }
 
 static struct pernet_operations __net_initdata udp_sysctl_ops = {
-	.init       = udp_sysctl_init,
+	.init	= udp_sysctl_init,
+	.async	= true,
 };
 
 void __init udp_init(void)
-- 
cgit v1.2.3


From b2864fbdc5ab6bb4750b00580b67070bd0ab3762 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 22 Mar 2018 21:34:55 +0300
Subject: net: Convert rxrpc_net_ops

These pernet_operations modifies rxrpc_net_id-pointed
per-net entities. There is external link to AF_RXRPC
in fs/afs/Kconfig, but it seems there is no other
pernet_operations interested in that per-net entities.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/rxrpc/net_ns.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index f18c9248e0d4..5fd939dabf41 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -106,4 +106,5 @@ struct pernet_operations rxrpc_net_ops = {
 	.exit	= rxrpc_exit_net,
 	.id	= &rxrpc_net_id,
 	.size	= sizeof(struct rxrpc_net),
+	.async	= true,
 };
-- 
cgit v1.2.3


From cb30a63384bc91d5da06e1cede1115f666a29271 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 22 Mar 2018 20:42:45 +0100
Subject: tipc: refactor function tipc_enable_bearer()

As a preparation for the next commits we try to reduce the footprint of
the function tipc_enable_bearer(), while hopefully making is simpler to
follow.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bearer.c | 136 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 70 insertions(+), 66 deletions(-)

(limited to 'net')

diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index f3d2e83313e1..e18cb271b005 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -230,88 +230,90 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest)
  * tipc_enable_bearer - enable bearer with the given name
  */
 static int tipc_enable_bearer(struct net *net, const char *name,
-			      u32 disc_domain, u32 priority,
+			      u32 disc_domain, u32 prio,
 			      struct nlattr *attr[])
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct tipc_net *tn = tipc_net(net);
+	struct tipc_bearer_names b_names;
+	u32 self = tipc_own_addr(net);
+	int with_this_prio = 1;
 	struct tipc_bearer *b;
 	struct tipc_media *m;
-	struct tipc_bearer_names b_names;
 	struct sk_buff *skb;
 	char addr_string[16];
-	u32 bearer_id;
-	u32 with_this_prio;
-	u32 i;
+	int bearer_id = 0;
 	int res = -EINVAL;
+	char *errstr = "";
 
-	if (!tn->own_addr) {
-		pr_warn("Bearer <%s> rejected, not supported in standalone mode\n",
-			name);
-		return -ENOPROTOOPT;
+	if (!self) {
+		errstr = "not supported in standalone mode";
+		res = -ENOPROTOOPT;
+		goto rejected;
 	}
+
 	if (!bearer_name_validate(name, &b_names)) {
-		pr_warn("Bearer <%s> rejected, illegal name\n", name);
-		return -EINVAL;
+		errstr = "illegal name";
+		goto rejected;
 	}
-	if (tipc_addr_domain_valid(disc_domain) &&
-	    (disc_domain != tn->own_addr)) {
-		if (tipc_in_scope(disc_domain, tn->own_addr)) {
-			disc_domain = tn->own_addr & TIPC_ZONE_CLUSTER_MASK;
-			res = 0;   /* accept any node in own cluster */
-		} else if (in_own_cluster_exact(net, disc_domain))
-			res = 0;   /* accept specified node in own cluster */
+
+	if (tipc_addr_domain_valid(disc_domain) && disc_domain != self) {
+		if (tipc_in_scope(disc_domain, self)) {
+			/* Accept any node in own cluster */
+			disc_domain = self & TIPC_ZONE_CLUSTER_MASK;
+			res = 0;
+		} else if (in_own_cluster_exact(net, disc_domain)) {
+			/* Accept specified node in own cluster */
+			res = 0;
+		}
 	}
 	if (res) {
-		pr_warn("Bearer <%s> rejected, illegal discovery domain\n",
-			name);
-		return -EINVAL;
+		errstr = "illegal discovery domain";
+		goto rejected;
 	}
-	if ((priority > TIPC_MAX_LINK_PRI) &&
-	    (priority != TIPC_MEDIA_LINK_PRI)) {
-		pr_warn("Bearer <%s> rejected, illegal priority\n", name);
-		return -EINVAL;
+
+	if (prio > TIPC_MAX_LINK_PRI && prio != TIPC_MEDIA_LINK_PRI) {
+		errstr = "illegal priority";
+		goto rejected;
 	}
 
 	m = tipc_media_find(b_names.media_name);
 	if (!m) {
-		pr_warn("Bearer <%s> rejected, media <%s> not registered\n",
-			name, b_names.media_name);
-		return -EINVAL;
+		errstr = "media not registered";
+		goto rejected;
 	}
 
-	if (priority == TIPC_MEDIA_LINK_PRI)
-		priority = m->priority;
+	if (prio == TIPC_MEDIA_LINK_PRI)
+		prio = m->priority;
 
-restart:
-	bearer_id = MAX_BEARERS;
-	with_this_prio = 1;
-	for (i = MAX_BEARERS; i-- != 0; ) {
-		b = rtnl_dereference(tn->bearer_list[i]);
-		if (!b) {
-			bearer_id = i;
-			continue;
-		}
+	/* Check new bearer vs existing ones and find free bearer id if any */
+	while (bearer_id < MAX_BEARERS) {
+		b = rtnl_dereference(tn->bearer_list[bearer_id]);
+		if (!b)
+			break;
 		if (!strcmp(name, b->name)) {
-			pr_warn("Bearer <%s> rejected, already enabled\n",
-				name);
-			return -EINVAL;
+			errstr = "already enabled";
+			goto rejected;
 		}
-		if ((b->priority == priority) &&
-		    (++with_this_prio > 2)) {
-			if (priority-- == 0) {
-				pr_warn("Bearer <%s> rejected, duplicate priority\n",
-					name);
-				return -EINVAL;
-			}
-			pr_warn("Bearer <%s> priority adjustment required %u->%u\n",
-				name, priority + 1, priority);
-			goto restart;
+		bearer_id++;
+		if (b->priority != prio)
+			continue;
+		if (++with_this_prio <= 2)
+			continue;
+		pr_warn("Bearer <%s>: already 2 bearers with priority %u\n",
+			name, prio);
+		if (prio == TIPC_MIN_LINK_PRI) {
+			errstr = "cannot adjust to lower";
+			goto rejected;
 		}
+		pr_warn("Bearer <%s>: trying with adjusted priority\n", name);
+		prio--;
+		bearer_id = 0;
+		with_this_prio = 1;
 	}
+
 	if (bearer_id >= MAX_BEARERS) {
-		pr_warn("Bearer <%s> rejected, bearer limit reached (%u)\n",
-			name, MAX_BEARERS);
-		return -EINVAL;
+		errstr = "max 3 bearers permitted";
+		goto rejected;
 	}
 
 	b = kzalloc(sizeof(*b), GFP_ATOMIC);
@@ -322,10 +324,9 @@ restart:
 	b->media = m;
 	res = m->enable_media(net, b, attr);
 	if (res) {
-		pr_warn("Bearer <%s> rejected, enable failure (%d)\n",
-			name, -res);
 		kfree(b);
-		return -EINVAL;
+		errstr = "failed to enable media";
+		goto rejected;
 	}
 
 	b->identity = bearer_id;
@@ -333,15 +334,15 @@ restart:
 	b->window = m->window;
 	b->domain = disc_domain;
 	b->net_plane = bearer_id + 'A';
-	b->priority = priority;
+	b->priority = prio;
 	test_and_set_bit_lock(0, &b->up);
 
 	res = tipc_disc_create(net, b, &b->bcast_addr, &skb);
 	if (res) {
 		bearer_disable(net, b);
-		pr_warn("Bearer <%s> rejected, discovery object creation failed\n",
-			name);
-		return -EINVAL;
+		kfree(b);
+		errstr = "failed to create discoverer";
+		goto rejected;
 	}
 
 	rcu_assign_pointer(tn->bearer_list[bearer_id], b);
@@ -353,9 +354,12 @@ restart:
 		return -ENOMEM;
 	}
 
-	pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
-		name,
-		tipc_addr_string_fill(addr_string, disc_domain), priority);
+	tipc_addr_string_fill(addr_string, disc_domain);
+	pr_info("Enabled bearer <%s>, discovery scope %s, priority %u\n",
+		name, addr_string, prio);
+	return res;
+rejected:
+	pr_warn("Bearer <%s> rejected, %s\n", name, errstr);
 	return res;
 }
 
-- 
cgit v1.2.3


From b39e465e56ec38ca64b4c0affeb6411eb0ed7267 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 22 Mar 2018 20:42:46 +0100
Subject: tipc: some cleanups in the file discover.c

To facilitate the coming changes in the neighbor discovery functionality
we make some renaming and refactoring of that code. The functional changes
in this commit are trivial, e.g., that we move the message sending call in
tipc_disc_timeout() outside the spinlock protected region.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bearer.c   |   8 +-
 net/tipc/bearer.h   |   2 +-
 net/tipc/discover.c | 303 +++++++++++++++++++++++++---------------------------
 net/tipc/discover.h |   8 +-
 4 files changed, 155 insertions(+), 166 deletions(-)

(limited to 'net')

diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index e18cb271b005..76340b9e4851 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -210,7 +210,7 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest)
 	rcu_read_lock();
 	b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
 	if (b)
-		tipc_disc_add_dest(b->link_req);
+		tipc_disc_add_dest(b->disc);
 	rcu_read_unlock();
 }
 
@@ -222,7 +222,7 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest)
 	rcu_read_lock();
 	b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
 	if (b)
-		tipc_disc_remove_dest(b->link_req);
+		tipc_disc_remove_dest(b->disc);
 	rcu_read_unlock();
 }
 
@@ -389,8 +389,8 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b)
 	tipc_node_delete_links(net, bearer_id);
 	b->media->disable_media(b);
 	RCU_INIT_POINTER(b->media_ptr, NULL);
-	if (b->link_req)
-		tipc_disc_delete(b->link_req);
+	if (b->disc)
+		tipc_disc_delete(b->disc);
 	RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL);
 	kfree_rcu(b, rcu);
 	tipc_mon_delete(net, bearer_id);
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index a53613d95bc9..6efcee63a381 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -159,7 +159,7 @@ struct tipc_bearer {
 	u32 tolerance;
 	u32 domain;
 	u32 identity;
-	struct tipc_link_req *link_req;
+	struct tipc_discoverer *disc;
 	char net_plane;
 	unsigned long up;
 };
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 92e4828c6b09..09f75558d353 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -39,34 +39,34 @@
 #include "discover.h"
 
 /* min delay during bearer start up */
-#define TIPC_LINK_REQ_INIT	msecs_to_jiffies(125)
+#define TIPC_DISC_INIT	msecs_to_jiffies(125)
 /* max delay if bearer has no links */
-#define TIPC_LINK_REQ_FAST	msecs_to_jiffies(1000)
+#define TIPC_DISC_FAST	msecs_to_jiffies(1000)
 /* max delay if bearer has links */
-#define TIPC_LINK_REQ_SLOW	msecs_to_jiffies(60000)
+#define TIPC_DISC_SLOW	msecs_to_jiffies(60000)
 /* indicates no timer in use */
-#define TIPC_LINK_REQ_INACTIVE	0xffffffff
+#define TIPC_DISC_INACTIVE	0xffffffff
 
 /**
- * struct tipc_link_req - information about an ongoing link setup request
+ * struct tipc_discoverer - information about an ongoing link setup request
  * @bearer_id: identity of bearer issuing requests
  * @net: network namespace instance
  * @dest: destination address for request messages
  * @domain: network domain to which links can be established
  * @num_nodes: number of nodes currently discovered (i.e. with an active link)
  * @lock: spinlock for controlling access to requests
- * @buf: request message to be (repeatedly) sent
+ * @skb: request message to be (repeatedly) sent
  * @timer: timer governing period between requests
  * @timer_intv: current interval between requests (in ms)
  */
-struct tipc_link_req {
+struct tipc_discoverer {
 	u32 bearer_id;
 	struct tipc_media_addr dest;
 	struct net *net;
 	u32 domain;
 	int num_nodes;
 	spinlock_t lock;
-	struct sk_buff *buf;
+	struct sk_buff *skb;
 	struct timer_list timer;
 	unsigned long timer_intv;
 };
@@ -77,22 +77,35 @@ struct tipc_link_req {
  * @type: message type (request or response)
  * @b: ptr to bearer issuing message
  */
-static void tipc_disc_init_msg(struct net *net, struct sk_buff *buf, u32 type,
-			       struct tipc_bearer *b)
+static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb,
+			       u32 mtyp, struct tipc_bearer *b)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	struct tipc_msg *msg;
+	struct tipc_net *tn = tipc_net(net);
 	u32 dest_domain = b->domain;
+	struct tipc_msg *hdr;
 
-	msg = buf_msg(buf);
-	tipc_msg_init(tn->own_addr, msg, LINK_CONFIG, type,
+	hdr = buf_msg(skb);
+	tipc_msg_init(tn->own_addr, hdr, LINK_CONFIG, mtyp,
 		      MAX_H_SIZE, dest_domain);
-	msg_set_non_seq(msg, 1);
-	msg_set_node_sig(msg, tn->random);
-	msg_set_node_capabilities(msg, TIPC_NODE_CAPABILITIES);
-	msg_set_dest_domain(msg, dest_domain);
-	msg_set_bc_netid(msg, tn->net_id);
-	b->media->addr2msg(msg_media_addr(msg), &b->addr);
+	msg_set_non_seq(hdr, 1);
+	msg_set_node_sig(hdr, tn->random);
+	msg_set_node_capabilities(hdr, TIPC_NODE_CAPABILITIES);
+	msg_set_dest_domain(hdr, dest_domain);
+	msg_set_bc_netid(hdr, tn->net_id);
+	b->media->addr2msg(msg_media_addr(hdr), &b->addr);
+}
+
+static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, u32 src,
+			       struct tipc_media_addr *maddr,
+			       struct tipc_bearer *b)
+{
+	struct sk_buff *skb;
+
+	skb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
+	if (!skb)
+		return;
+	tipc_disc_init_msg(net, skb, mtyp, b);
+	tipc_bearer_xmit_skb(net, b->identity, skb, maddr);
 }
 
 /**
@@ -116,149 +129,123 @@ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr,
 
 /**
  * tipc_disc_rcv - handle incoming discovery message (request or response)
- * @net: the applicable net namespace
- * @buf: buffer containing message
- * @bearer: bearer that message arrived on
+ * @net: applicable net namespace
+ * @skb: buffer containing message
+ * @b: bearer that message arrived on
  */
 void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
-		   struct tipc_bearer *bearer)
+		   struct tipc_bearer *b)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	struct tipc_media_addr maddr;
-	struct sk_buff *rskb;
+	struct tipc_net *tn = tipc_net(net);
 	struct tipc_msg *hdr = buf_msg(skb);
-	u32 ddom = msg_dest_domain(hdr);
-	u32 onode = msg_prevnode(hdr);
+	u16 caps = msg_node_capabilities(hdr);
+	u32 signature = msg_node_sig(hdr);
+	u32 dst = msg_dest_domain(hdr);
 	u32 net_id = msg_bc_netid(hdr);
+	u32 self = tipc_own_addr(net);
+	struct tipc_media_addr maddr;
+	u32 src = msg_prevnode(hdr);
 	u32 mtyp = msg_type(hdr);
-	u32 signature = msg_node_sig(hdr);
-	u16 caps = msg_node_capabilities(hdr);
-	bool respond = false;
 	bool dupl_addr = false;
+	bool respond = false;
 	int err;
 
-	err = bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr));
+	err = b->media->msg2addr(b, &maddr, msg_media_addr(hdr));
 	kfree_skb(skb);
-	if (err)
+	if (err || maddr.broadcast) {
+		pr_warn_ratelimited("Rcv corrupt discovery message\n");
 		return;
-
-	/* Ensure message from node is valid and communication is permitted */
-	if (net_id != tn->net_id)
+	}
+	/* Ignore discovery messages from own node */
+	if (!memcmp(&maddr, &b->addr, sizeof(maddr)))
 		return;
-	if (maddr.broadcast)
+	if (net_id != tn->net_id)
 		return;
-	if (!tipc_addr_domain_valid(ddom))
+	if (!tipc_addr_domain_valid(dst))
 		return;
-	if (!tipc_addr_node_valid(onode))
+	if (!tipc_addr_node_valid(src))
 		return;
-
-	if (in_own_node(net, onode)) {
-		if (memcmp(&maddr, &bearer->addr, sizeof(maddr)))
-			disc_dupl_alert(bearer, tn->own_addr, &maddr);
+	if (in_own_node(net, src)) {
+		disc_dupl_alert(b, self, &maddr);
 		return;
 	}
-	if (!tipc_in_scope(ddom, tn->own_addr))
+	if (!tipc_in_scope(dst, self))
 		return;
-	if (!tipc_in_scope(bearer->domain, onode))
+	if (!tipc_in_scope(b->domain, src))
 		return;
-
-	tipc_node_check_dest(net, onode, bearer, caps, signature,
+	tipc_node_check_dest(net, src, b, caps, signature,
 			     &maddr, &respond, &dupl_addr);
 	if (dupl_addr)
-		disc_dupl_alert(bearer, onode, &maddr);
-
-	/* Send response, if necessary */
-	if (respond && (mtyp == DSC_REQ_MSG)) {
-		rskb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
-		if (!rskb)
-			return;
-		tipc_disc_init_msg(net, rskb, DSC_RESP_MSG, bearer);
-		tipc_bearer_xmit_skb(net, bearer->identity, rskb, &maddr);
-	}
+		disc_dupl_alert(b, src, &maddr);
+	if (!respond)
+		return;
+	if (mtyp != DSC_REQ_MSG)
+		return;
+	tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, &maddr, b);
 }
 
-/**
- * disc_update - update frequency of periodic link setup requests
- * @req: ptr to link request structure
- *
- * Reinitiates discovery process if discovery object has no associated nodes
- * and is either not currently searching or is searching at a slow rate
+/* tipc_disc_add_dest - increment set of discovered nodes
  */
-static void disc_update(struct tipc_link_req *req)
+void tipc_disc_add_dest(struct tipc_discoverer *d)
 {
-	if (!req->num_nodes) {
-		if ((req->timer_intv == TIPC_LINK_REQ_INACTIVE) ||
-		    (req->timer_intv > TIPC_LINK_REQ_FAST)) {
-			req->timer_intv = TIPC_LINK_REQ_INIT;
-			mod_timer(&req->timer, jiffies + req->timer_intv);
-		}
-	}
+	spin_lock_bh(&d->lock);
+	d->num_nodes++;
+	spin_unlock_bh(&d->lock);
 }
 
-/**
- * tipc_disc_add_dest - increment set of discovered nodes
- * @req: ptr to link request structure
+/* tipc_disc_remove_dest - decrement set of discovered nodes
  */
-void tipc_disc_add_dest(struct tipc_link_req *req)
+void tipc_disc_remove_dest(struct tipc_discoverer *d)
 {
-	spin_lock_bh(&req->lock);
-	req->num_nodes++;
-	spin_unlock_bh(&req->lock);
-}
+	int intv, num;
 
-/**
- * tipc_disc_remove_dest - decrement set of discovered nodes
- * @req: ptr to link request structure
- */
-void tipc_disc_remove_dest(struct tipc_link_req *req)
-{
-	spin_lock_bh(&req->lock);
-	req->num_nodes--;
-	disc_update(req);
-	spin_unlock_bh(&req->lock);
+	spin_lock_bh(&d->lock);
+	d->num_nodes--;
+	num = d->num_nodes;
+	intv = d->timer_intv;
+	if (!num && (intv == TIPC_DISC_INACTIVE || intv > TIPC_DISC_FAST))  {
+		d->timer_intv = TIPC_DISC_INIT;
+		mod_timer(&d->timer, jiffies + d->timer_intv);
+	}
+	spin_unlock_bh(&d->lock);
 }
 
-/**
- * disc_timeout - send a periodic link setup request
- * @data: ptr to link request structure
- *
+/* tipc_disc_timeout - send a periodic link setup request
  * Called whenever a link setup request timer associated with a bearer expires.
+ * - Keep doubling time between sent request until limit is reached;
+ * - Hold at fast polling rate if we don't have any associated nodes
+ * - Otherwise hold at slow polling rate
  */
-static void disc_timeout(struct timer_list *t)
+static void tipc_disc_timeout(struct timer_list *t)
 {
-	struct tipc_link_req *req = from_timer(req, t, timer);
-	struct sk_buff *skb;
-	int max_delay;
+	struct tipc_discoverer *d = from_timer(d, t, timer);
+	struct tipc_media_addr maddr;
+	struct sk_buff *skb = NULL;
+	struct net *net;
+	u32 bearer_id;
 
-	spin_lock_bh(&req->lock);
+	spin_lock_bh(&d->lock);
 
 	/* Stop searching if only desired node has been found */
-	if (tipc_node(req->domain) && req->num_nodes) {
-		req->timer_intv = TIPC_LINK_REQ_INACTIVE;
+	if (tipc_node(d->domain) && d->num_nodes) {
+		d->timer_intv = TIPC_DISC_INACTIVE;
 		goto exit;
 	}
-
-	/*
-	 * Send discovery message, then update discovery timer
-	 *
-	 * Keep doubling time between requests until limit is reached;
-	 * hold at fast polling rate if don't have any associated nodes,
-	 * otherwise hold at slow polling rate
-	 */
-	skb = skb_clone(req->buf, GFP_ATOMIC);
-	if (skb)
-		tipc_bearer_xmit_skb(req->net, req->bearer_id, skb, &req->dest);
-	req->timer_intv *= 2;
-	if (req->num_nodes)
-		max_delay = TIPC_LINK_REQ_SLOW;
-	else
-		max_delay = TIPC_LINK_REQ_FAST;
-	if (req->timer_intv > max_delay)
-		req->timer_intv = max_delay;
-
-	mod_timer(&req->timer, jiffies + req->timer_intv);
+	/* Adjust timeout interval according to discovery phase */
+	d->timer_intv *= 2;
+	if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW)
+		d->timer_intv = TIPC_DISC_SLOW;
+	else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST)
+		d->timer_intv = TIPC_DISC_FAST;
+	mod_timer(&d->timer, jiffies + d->timer_intv);
+	memcpy(&maddr, &d->dest, sizeof(maddr));
+	skb = skb_clone(d->skb, GFP_ATOMIC);
+	net = d->net;
+	bearer_id = d->bearer_id;
 exit:
-	spin_unlock_bh(&req->lock);
+	spin_unlock_bh(&d->lock);
+	if (skb)
+		tipc_bearer_xmit_skb(net, bearer_id, skb, &maddr);
 }
 
 /**
@@ -273,41 +260,41 @@ exit:
 int tipc_disc_create(struct net *net, struct tipc_bearer *b,
 		     struct tipc_media_addr *dest, struct sk_buff **skb)
 {
-	struct tipc_link_req *req;
+	struct tipc_discoverer *d;
 
-	req = kmalloc(sizeof(*req), GFP_ATOMIC);
-	if (!req)
+	d = kmalloc(sizeof(*d), GFP_ATOMIC);
+	if (!d)
 		return -ENOMEM;
-	req->buf = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
-	if (!req->buf) {
-		kfree(req);
+	d->skb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
+	if (!d->skb) {
+		kfree(d);
 		return -ENOMEM;
 	}
 
-	tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b);
-	memcpy(&req->dest, dest, sizeof(*dest));
-	req->net = net;
-	req->bearer_id = b->identity;
-	req->domain = b->domain;
-	req->num_nodes = 0;
-	req->timer_intv = TIPC_LINK_REQ_INIT;
-	spin_lock_init(&req->lock);
-	timer_setup(&req->timer, disc_timeout, 0);
-	mod_timer(&req->timer, jiffies + req->timer_intv);
-	b->link_req = req;
-	*skb = skb_clone(req->buf, GFP_ATOMIC);
+	tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b);
+	memcpy(&d->dest, dest, sizeof(*dest));
+	d->net = net;
+	d->bearer_id = b->identity;
+	d->domain = b->domain;
+	d->num_nodes = 0;
+	d->timer_intv = TIPC_DISC_INIT;
+	spin_lock_init(&d->lock);
+	timer_setup(&d->timer, tipc_disc_timeout, 0);
+	mod_timer(&d->timer, jiffies + d->timer_intv);
+	b->disc = d;
+	*skb = skb_clone(d->skb, GFP_ATOMIC);
 	return 0;
 }
 
 /**
  * tipc_disc_delete - destroy object sending periodic link setup requests
- * @req: ptr to link request structure
+ * @d: ptr to link duest structure
  */
-void tipc_disc_delete(struct tipc_link_req *req)
+void tipc_disc_delete(struct tipc_discoverer *d)
 {
-	del_timer_sync(&req->timer);
-	kfree_skb(req->buf);
-	kfree(req);
+	del_timer_sync(&d->timer);
+	kfree_skb(d->skb);
+	kfree(d);
 }
 
 /**
@@ -318,19 +305,21 @@ void tipc_disc_delete(struct tipc_link_req *req)
  */
 void tipc_disc_reset(struct net *net, struct tipc_bearer *b)
 {
-	struct tipc_link_req *req = b->link_req;
+	struct tipc_discoverer *d = b->disc;
+	struct tipc_media_addr maddr;
 	struct sk_buff *skb;
 
-	spin_lock_bh(&req->lock);
-	tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b);
-	req->net = net;
-	req->bearer_id = b->identity;
-	req->domain = b->domain;
-	req->num_nodes = 0;
-	req->timer_intv = TIPC_LINK_REQ_INIT;
-	mod_timer(&req->timer, jiffies + req->timer_intv);
-	skb = skb_clone(req->buf, GFP_ATOMIC);
+	spin_lock_bh(&d->lock);
+	tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b);
+	d->net = net;
+	d->bearer_id = b->identity;
+	d->domain = b->domain;
+	d->num_nodes = 0;
+	d->timer_intv = TIPC_DISC_INIT;
+	memcpy(&maddr, &d->dest, sizeof(maddr));
+	mod_timer(&d->timer, jiffies + d->timer_intv);
+	skb = skb_clone(d->skb, GFP_ATOMIC);
+	spin_unlock_bh(&d->lock);
 	if (skb)
-		tipc_bearer_xmit_skb(net, req->bearer_id, skb, &req->dest);
-	spin_unlock_bh(&req->lock);
+		tipc_bearer_xmit_skb(net, b->identity, skb, &maddr);
 }
diff --git a/net/tipc/discover.h b/net/tipc/discover.h
index b80a335389c0..521d96c41dfd 100644
--- a/net/tipc/discover.h
+++ b/net/tipc/discover.h
@@ -37,14 +37,14 @@
 #ifndef _TIPC_DISCOVER_H
 #define _TIPC_DISCOVER_H
 
-struct tipc_link_req;
+struct tipc_discoverer;
 
 int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr,
 		     struct tipc_media_addr *dest, struct sk_buff **skb);
-void tipc_disc_delete(struct tipc_link_req *req);
+void tipc_disc_delete(struct tipc_discoverer *req);
 void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr);
-void tipc_disc_add_dest(struct tipc_link_req *req);
-void tipc_disc_remove_dest(struct tipc_link_req *req);
+void tipc_disc_add_dest(struct tipc_discoverer *req);
+void tipc_disc_remove_dest(struct tipc_discoverer *req);
 void tipc_disc_rcv(struct net *net, struct sk_buff *buf,
 		   struct tipc_bearer *b_ptr);
 
-- 
cgit v1.2.3


From 2026364149db36c6a2c0c8cae8362fe9a7f954dd Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 22 Mar 2018 20:42:47 +0100
Subject: tipc: remove restrictions on node address values

Nominally, TIPC organizes network nodes into a three-level network
hierarchy consisting of the levels 'zone', 'cluster' and 'node'. This
hierarchy is reflected in the node address format, - it is sub-divided
into an 8-bit zone id, and 12 bit cluster id, and a 12-bit node id.

However, the 'zone' and 'cluster' levels have in reality never been
fully implemented,and never will be. The result of this has been
that the first 20 bits the node identity structure have been wasted,
and the usable node identity range within a cluster has been limited
to 12 bits. This is starting to become a problem.

In the following commits, we will need to be able to connect between
nodes which are using the whole 32-bit value space of the node address.
We therefore remove the restrictions on which values can be assigned
to node identity, -it is from now on only a 32-bit integer with no
assumed internal structure.

Isolation between clusters is now achieved only by setting different
values for the 'network id' field used during neighbor discovery, in
practice leading to the latter becoming the new cluster identity.

The rules for accepting discovery requests/responses from neighboring
nodes now become:

- If the user is using legacy address format on both peers, reception
  of discovery messages is subject to the legacy lookup domain check
  in addition to the cluster id check.

- Otherwise, the discovery request/response is always accepted, provided
  both peers have the same network id.

This secures backwards compatibility for users who have been using zone
or cluster identities as cluster separators, instead of the intended
'network id'.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/addr.c     | 50 +-------------------------------------------------
 net/tipc/addr.h     | 11 -----------
 net/tipc/bearer.c   | 27 ++++-----------------------
 net/tipc/discover.c | 15 +++++++--------
 net/tipc/link.c     |  6 ++----
 net/tipc/net.c      |  4 ++--
 net/tipc/node.c     |  8 ++------
 net/tipc/node.h     |  5 +++--
 8 files changed, 21 insertions(+), 105 deletions(-)

(limited to 'net')

diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index 97cd857d7f43..dfc31a730ca5 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -38,21 +38,6 @@
 #include "addr.h"
 #include "core.h"
 
-/**
- * in_own_cluster - test for cluster inclusion; <0.0.0> always matches
- */
-int in_own_cluster(struct net *net, u32 addr)
-{
-	return in_own_cluster_exact(net, addr) || !addr;
-}
-
-int in_own_cluster_exact(struct net *net, u32 addr)
-{
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-
-	return !((addr ^ tn->own_addr) >> 12);
-}
-
 /**
  * in_own_node - test for node inclusion; <0.0.0> always matches
  */
@@ -63,46 +48,13 @@ int in_own_node(struct net *net, u32 addr)
 	return (addr == tn->own_addr) || !addr;
 }
 
-/**
- * tipc_addr_domain_valid - validates a network domain address
- *
- * Accepts <Z.C.N>, <Z.C.0>, <Z.0.0>, and <0.0.0>,
- * where Z, C, and N are non-zero.
- *
- * Returns 1 if domain address is valid, otherwise 0
- */
-int tipc_addr_domain_valid(u32 addr)
-{
-	u32 n = tipc_node(addr);
-	u32 c = tipc_cluster(addr);
-	u32 z = tipc_zone(addr);
-
-	if (n && (!z || !c))
-		return 0;
-	if (c && !z)
-		return 0;
-	return 1;
-}
-
-/**
- * tipc_addr_node_valid - validates a proposed network address for this node
- *
- * Accepts <Z.C.N>, where Z, C, and N are non-zero.
- *
- * Returns 1 if address can be used, otherwise 0
- */
-int tipc_addr_node_valid(u32 addr)
-{
-	return tipc_addr_domain_valid(addr) && tipc_node(addr);
-}
-
 int tipc_in_scope(u32 domain, u32 addr)
 {
 	if (!domain || (domain == addr))
 		return 1;
 	if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */
 		return 1;
-	if (domain == tipc_zone_mask(addr)) /* domain <Z.0.0> */
+	if (domain == (addr & TIPC_ZONE_MASK)) /* domain <Z.0.0> */
 		return 1;
 	return 0;
 }
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
index 2ecf5a5d40dd..5ffde51b0e68 100644
--- a/net/tipc/addr.h
+++ b/net/tipc/addr.h
@@ -50,11 +50,6 @@ static inline u32 tipc_own_addr(struct net *net)
 	return tn->own_addr;
 }
 
-static inline u32 tipc_zone_mask(u32 addr)
-{
-	return addr & TIPC_ZONE_MASK;
-}
-
 static inline u32 tipc_cluster_mask(u32 addr)
 {
 	return addr & TIPC_ZONE_CLUSTER_MASK;
@@ -71,14 +66,8 @@ static inline int tipc_scope2node(struct net *net, int sc)
 }
 
 u32 tipc_own_addr(struct net *net);
-int in_own_cluster(struct net *net, u32 addr);
-int in_own_cluster_exact(struct net *net, u32 addr);
 int in_own_node(struct net *net, u32 addr);
-u32 addr_domain(struct net *net, u32 sc);
-int tipc_addr_domain_valid(u32);
-int tipc_addr_node_valid(u32 addr);
 int tipc_in_scope(u32 domain, u32 addr);
-int tipc_addr_scope(u32 domain);
 char *tipc_addr_string_fill(char *string, u32 addr);
 
 #endif
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 76340b9e4851..a71f31879cb3 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -240,7 +240,6 @@ static int tipc_enable_bearer(struct net *net, const char *name,
 	struct tipc_bearer *b;
 	struct tipc_media *m;
 	struct sk_buff *skb;
-	char addr_string[16];
 	int bearer_id = 0;
 	int res = -EINVAL;
 	char *errstr = "";
@@ -256,21 +255,6 @@ static int tipc_enable_bearer(struct net *net, const char *name,
 		goto rejected;
 	}
 
-	if (tipc_addr_domain_valid(disc_domain) && disc_domain != self) {
-		if (tipc_in_scope(disc_domain, self)) {
-			/* Accept any node in own cluster */
-			disc_domain = self & TIPC_ZONE_CLUSTER_MASK;
-			res = 0;
-		} else if (in_own_cluster_exact(net, disc_domain)) {
-			/* Accept specified node in own cluster */
-			res = 0;
-		}
-	}
-	if (res) {
-		errstr = "illegal discovery domain";
-		goto rejected;
-	}
-
 	if (prio > TIPC_MAX_LINK_PRI && prio != TIPC_MEDIA_LINK_PRI) {
 		errstr = "illegal priority";
 		goto rejected;
@@ -354,12 +338,11 @@ static int tipc_enable_bearer(struct net *net, const char *name,
 		return -ENOMEM;
 	}
 
-	tipc_addr_string_fill(addr_string, disc_domain);
-	pr_info("Enabled bearer <%s>, discovery scope %s, priority %u\n",
-		name, addr_string, prio);
+	pr_info("Enabled bearer <%s>, priority %u\n", name, prio);
+
 	return res;
 rejected:
-	pr_warn("Bearer <%s> rejected, %s\n", name, errstr);
+	pr_warn("Enabling of bearer <%s> rejected, %s\n", name, errstr);
 	return res;
 }
 
@@ -865,12 +848,10 @@ int __tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info)
 	char *bearer;
 	struct nlattr *attrs[TIPC_NLA_BEARER_MAX + 1];
 	struct net *net = sock_net(skb->sk);
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	u32 domain;
+	u32 domain = 0;
 	u32 prio;
 
 	prio = TIPC_MEDIA_LINK_PRI;
-	domain = tn->own_addr & TIPC_ZONE_CLUSTER_MASK;
 
 	if (!info->attrs[TIPC_NLA_BEARER])
 		return -EINVAL;
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 09f75558d353..669af125b3de 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -161,18 +161,17 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
 		return;
 	if (net_id != tn->net_id)
 		return;
-	if (!tipc_addr_domain_valid(dst))
-		return;
-	if (!tipc_addr_node_valid(src))
-		return;
 	if (in_own_node(net, src)) {
 		disc_dupl_alert(b, self, &maddr);
 		return;
 	}
-	if (!tipc_in_scope(dst, self))
-		return;
-	if (!tipc_in_scope(b->domain, src))
-		return;
+	/* Domain filter only works if both peers use legacy address format */
+	if (b->domain) {
+		if (!tipc_in_scope(dst, self))
+			return;
+		if (!tipc_in_scope(b->domain, src))
+			return;
+	}
 	tipc_node_check_dest(net, src, b, caps, signature,
 			     &maddr, &respond, &dupl_addr);
 	if (dupl_addr)
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 3c230466804d..86fde005ea47 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -434,7 +434,7 @@ char *tipc_link_name(struct tipc_link *l)
  */
 bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
 		      int tolerance, char net_plane, u32 mtu, int priority,
-		      int window, u32 session, u32 ownnode, u32 peer,
+		      int window, u32 session, u32 self, u32 peer,
 		      u16 peer_caps,
 		      struct tipc_link *bc_sndlink,
 		      struct tipc_link *bc_rcvlink,
@@ -451,9 +451,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
 	l->session = session;
 
 	/* Note: peer i/f name is completed by reset/activate message */
-	sprintf(l->name, "%u.%u.%u:%s-%u.%u.%u:unknown",
-		tipc_zone(ownnode), tipc_cluster(ownnode), tipc_node(ownnode),
-		if_name, tipc_zone(peer), tipc_cluster(peer), tipc_node(peer));
+	sprintf(l->name, "%x:%s-%x:unknown", self, if_name, peer);
 	strcpy(l->if_name, if_name);
 	l->addr = peer;
 	l->peer_caps = peer_caps;
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 5c4c4405b78e..a074f285e6ea 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -121,7 +121,7 @@ int tipc_net_start(struct net *net, u32 addr)
 			     TIPC_CLUSTER_SCOPE, 0, tn->own_addr);
 
 	pr_info("Started in network mode\n");
-	pr_info("Own node address %s, network identity %u\n",
+	pr_info("Own node address %s, cluster identity %u\n",
 		tipc_addr_string_fill(addr_string, tn->own_addr),
 		tn->net_id);
 	return 0;
@@ -238,7 +238,7 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
 			return -EPERM;
 
 		addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]);
-		if (!tipc_addr_node_valid(addr))
+		if (!addr)
 			return -EINVAL;
 
 		tipc_net_start(net, addr);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 389193d7cf67..8a4b04933ecc 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -233,9 +233,6 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr)
 	struct tipc_node *node;
 	unsigned int thash = tipc_hashfn(addr);
 
-	if (unlikely(!in_own_cluster_exact(net, addr)))
-		return NULL;
-
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) {
 		if (node->addr != addr)
@@ -836,10 +833,9 @@ void tipc_node_check_dest(struct net *net, u32 onode,
 
 	/* Now create new link if not already existing */
 	if (!l) {
-		if (n->link_cnt == 2) {
-			pr_warn("Cannot establish 3rd link to %x\n", n->addr);
+		if (n->link_cnt == 2)
 			goto exit;
-		}
+
 		if_name = strchr(b->name, ':') + 1;
 		if (!tipc_link_create(net, if_name, b->identity, b->tolerance,
 				      b->net_plane, b->mtu, b->priority,
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 4ce5e3a185c0..5fb38cf0bb5c 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -49,13 +49,14 @@ enum {
 	TIPC_BCAST_STATE_NACK = (1 << 2),
 	TIPC_BLOCK_FLOWCTL    = (1 << 3),
 	TIPC_BCAST_RCAST      = (1 << 4),
-	TIPC_MCAST_GROUPS     = (1 << 5)
+	TIPC_NODE_ID32        = (1 << 5)
 };
 
 #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
 				TIPC_BCAST_STATE_NACK | \
 				TIPC_BCAST_RCAST | \
-				TIPC_BLOCK_FLOWCTL)
+				TIPC_BLOCK_FLOWCTL | \
+				TIPC_NODE_ID32)
 #define INVALID_BEARER_ID -1
 
 void tipc_node_stop(struct net *net);
-- 
cgit v1.2.3


From b89afb116ca2830cc982624f93e888860868a84b Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 22 Mar 2018 20:42:48 +0100
Subject: tipc: allow closest-first lookup algorithm when legacy address is
 configured

The removal of an internal structure of the node address has an unwanted
side effect.
- Currently, if a user is sending an anycast message with destination
  domain 0, the tipc_namebl_translate() function will use the 'closest-
  first' algorithm to first look for a node local destination, and only
  when no such is found, will it resort to the cluster global 'round-
  robin' lookup algorithm.
- Current users can get around this, and enforce unconditional use of
  global round-robin by indicating a destination as Z.0.0 or Z.C.0.
- This option disappears when we make the node address flat, since the
  lookup algorithm has no way of recognizing this case. So, as long as
  there are node local destinations, the algorithm will always select
  one of those, and there is nothing the sender can do to change this.

We solve this by eliminating the 'closest-first' option, which was never
a good idea anyway, for non-legacy users, but only for those. To
distinguish between legacy users and non-legacy users we introduce a new
flag 'legacy_addr_format' in struct tipc_core, to be set when the user
configures a legacy-style Z.C.N node address. Hence, when a legacy user
indicates a zero lookup domain 'closest-first' is selected, and in all
other cases we use 'round-robin'.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/addr.c       | 12 +++++++-----
 net/tipc/addr.h       |  2 +-
 net/tipc/core.h       |  3 ++-
 net/tipc/discover.c   | 13 ++++++-------
 net/tipc/name_table.c |  8 +++++---
 net/tipc/net.c        |  2 +-
 6 files changed, 22 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index dfc31a730ca5..19987994704f 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -48,15 +48,17 @@ int in_own_node(struct net *net, u32 addr)
 	return (addr == tn->own_addr) || !addr;
 }
 
-int tipc_in_scope(u32 domain, u32 addr)
+bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr)
 {
 	if (!domain || (domain == addr))
-		return 1;
+		return true;
+	if (!legacy_format)
+		return false;
 	if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */
-		return 1;
+		return true;
 	if (domain == (addr & TIPC_ZONE_MASK)) /* domain <Z.0.0> */
-		return 1;
-	return 0;
+		return true;
+	return false;
 }
 
 char *tipc_addr_string_fill(char *string, u32 addr)
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
index 5ffde51b0e68..97bdc0e1a369 100644
--- a/net/tipc/addr.h
+++ b/net/tipc/addr.h
@@ -67,7 +67,7 @@ static inline int tipc_scope2node(struct net *net, int sc)
 
 u32 tipc_own_addr(struct net *net);
 int in_own_node(struct net *net, u32 addr);
-int tipc_in_scope(u32 domain, u32 addr);
+bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr);
 char *tipc_addr_string_fill(char *string, u32 addr);
 
 #endif
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 347f850dc872..bd2b112680a4 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -1,7 +1,7 @@
 /*
  * net/tipc/core.h: Include file for TIPC global declarations
  *
- * Copyright (c) 2005-2006, 2013 Ericsson AB
+ * Copyright (c) 2005-2006, 2013-2018 Ericsson AB
  * Copyright (c) 2005-2007, 2010-2013, Wind River Systems
  * All rights reserved.
  *
@@ -81,6 +81,7 @@ struct tipc_net {
 	u32 own_addr;
 	int net_id;
 	int random;
+	bool legacy_addr_format;
 
 	/* Node table and node list */
 	spinlock_t node_list_lock;
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 669af125b3de..82556e19222d 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -139,6 +139,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
 	struct tipc_net *tn = tipc_net(net);
 	struct tipc_msg *hdr = buf_msg(skb);
 	u16 caps = msg_node_capabilities(hdr);
+	bool legacy = tn->legacy_addr_format;
 	u32 signature = msg_node_sig(hdr);
 	u32 dst = msg_dest_domain(hdr);
 	u32 net_id = msg_bc_netid(hdr);
@@ -165,13 +166,11 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
 		disc_dupl_alert(b, self, &maddr);
 		return;
 	}
-	/* Domain filter only works if both peers use legacy address format */
-	if (b->domain) {
-		if (!tipc_in_scope(dst, self))
-			return;
-		if (!tipc_in_scope(b->domain, src))
-			return;
-	}
+	if (!tipc_in_scope(legacy, dst, self))
+		return;
+	if (!tipc_in_scope(legacy, b->domain, src))
+		return;
+
 	tipc_node_check_dest(net, src, b, caps, signature,
 			     &maddr, &respond, &dupl_addr);
 	if (dupl_addr)
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index bbbfc0702634..7478acb39096 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -499,7 +499,9 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
 u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
 			   u32 *destnode)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct tipc_net *tn = tipc_net(net);
+	bool legacy = tn->legacy_addr_format;
+	u32 self = tipc_own_addr(net);
 	struct sub_seq *sseq;
 	struct name_info *info;
 	struct publication *publ;
@@ -507,7 +509,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
 	u32 port = 0;
 	u32 node = 0;
 
-	if (!tipc_in_scope(*destnode, tn->own_addr))
+	if (!tipc_in_scope(legacy, *destnode, self))
 		return 0;
 
 	rcu_read_lock();
@@ -521,7 +523,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
 	info = sseq->info;
 
 	/* Closest-First Algorithm */
-	if (likely(!*destnode)) {
+	if (legacy && !*destnode) {
 		if (!list_empty(&info->local_publ)) {
 			publ = list_first_entry(&info->local_publ,
 						struct publication,
diff --git a/net/tipc/net.c b/net/tipc/net.c
index a074f285e6ea..eb0d7a352e3f 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -240,7 +240,7 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
 		addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]);
 		if (!addr)
 			return -EINVAL;
-
+		tn->legacy_addr_format = true;
 		tipc_net_start(net, addr);
 	}
 
-- 
cgit v1.2.3


From 23fd3eace088ab1872ee59c19191a119ec779ac9 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 22 Mar 2018 20:42:49 +0100
Subject: tipc: remove direct accesses to own_addr field in struct tipc_net

As a preparation to changing the addressing structure of TIPC we replace
all direct accesses to the tipc_net::own_addr field with the function
dedicated for this, tipc_own_addr().

There are no changes to program logics in this commit.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/addr.c       |  6 +++---
 net/tipc/addr.h       |  2 +-
 net/tipc/discover.c   |  3 ++-
 net/tipc/link.c       |  9 ++++-----
 net/tipc/name_distr.c | 11 ++++++-----
 net/tipc/name_table.c |  6 +++---
 net/tipc/net.c        | 31 +++++++++++++------------------
 net/tipc/socket.c     | 23 ++++++++++-------------
 8 files changed, 42 insertions(+), 49 deletions(-)

(limited to 'net')

diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index 19987994704f..6e06b4d981f1 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -43,9 +43,7 @@
  */
 int in_own_node(struct net *net, u32 addr)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-
-	return (addr == tn->own_addr) || !addr;
+	return addr == tipc_own_addr(net) || !addr;
 }
 
 bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr)
@@ -56,6 +54,8 @@ bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr)
 		return false;
 	if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */
 		return true;
+	if (domain == (addr & TIPC_ZONE_CLUSTER_MASK)) /* domain <Z.C.0> */
+		return true;
 	if (domain == (addr & TIPC_ZONE_MASK)) /* domain <Z.0.0> */
 		return true;
 	return false;
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
index 97bdc0e1a369..6b48f0dc0205 100644
--- a/net/tipc/addr.h
+++ b/net/tipc/addr.h
@@ -45,7 +45,7 @@
 
 static inline u32 tipc_own_addr(struct net *net)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct tipc_net *tn = tipc_net(net);
 
 	return tn->own_addr;
 }
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 82556e19222d..94d524018ca5 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -81,11 +81,12 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb,
 			       u32 mtyp, struct tipc_bearer *b)
 {
 	struct tipc_net *tn = tipc_net(net);
+	u32 self = tipc_own_addr(net);
 	u32 dest_domain = b->domain;
 	struct tipc_msg *hdr;
 
 	hdr = buf_msg(skb);
-	tipc_msg_init(tn->own_addr, hdr, LINK_CONFIG, mtyp,
+	tipc_msg_init(self, hdr, LINK_CONFIG, mtyp,
 		      MAX_H_SIZE, dest_domain);
 	msg_set_non_seq(hdr, 1);
 	msg_set_node_sig(hdr, tn->random);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 86fde005ea47..4aa56e3bf4fc 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1936,11 +1936,11 @@ msg_full:
 int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
 		       struct tipc_link *link, int nlflags)
 {
-	int err;
-	void *hdr;
+	u32 self = tipc_own_addr(net);
 	struct nlattr *attrs;
 	struct nlattr *prop;
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	void *hdr;
+	int err;
 
 	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
 			  nlflags, TIPC_NL_LINK_GET);
@@ -1953,8 +1953,7 @@ int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
 
 	if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, link->name))
 		goto attr_msg_full;
-	if (nla_put_u32(msg->skb, TIPC_NLA_LINK_DEST,
-			tipc_cluster_mask(tn->own_addr)))
+	if (nla_put_u32(msg->skb, TIPC_NLA_LINK_DEST, tipc_cluster_mask(self)))
 		goto attr_msg_full;
 	if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu))
 		goto attr_msg_full;
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 28d095a7d8bb..7e571f4f47bc 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -68,14 +68,14 @@ static void publ_to_item(struct distr_item *i, struct publication *p)
 static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size,
 					 u32 dest)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC);
+	u32 self = tipc_own_addr(net);
 	struct tipc_msg *msg;
 
 	if (buf != NULL) {
 		msg = buf_msg(buf);
-		tipc_msg_init(tn->own_addr, msg, NAME_DISTRIBUTOR, type,
-			      INT_H_SIZE, dest);
+		tipc_msg_init(self, msg, NAME_DISTRIBUTOR,
+			      type, INT_H_SIZE, dest);
 		msg_set_size(msg, INT_H_SIZE + size);
 	}
 	return buf;
@@ -382,13 +382,14 @@ void tipc_named_reinit(struct net *net)
 	struct name_table *nt = tipc_name_table(net);
 	struct tipc_net *tn = tipc_net(net);
 	struct publication *publ;
+	u32 self = tipc_own_addr(net);
 
 	spin_lock_bh(&tn->nametbl_lock);
 
 	list_for_each_entry_rcu(publ, &nt->node_scope, binding_node)
-		publ->node = tn->own_addr;
+		publ->node = self;
 	list_for_each_entry_rcu(publ, &nt->cluster_scope, binding_node)
-		publ->node = tn->own_addr;
+		publ->node = self;
 
 	spin_unlock_bh(&tn->nametbl_lock);
 }
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 7478acb39096..4359605b1bec 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -540,7 +540,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
 	}
 
 	/* Round-Robin Algorithm */
-	else if (*destnode == tn->own_addr) {
+	else if (*destnode == tipc_own_addr(net)) {
 		if (list_empty(&info->local_publ))
 			goto no_match;
 		publ = list_first_entry(&info->local_publ, struct publication,
@@ -713,7 +713,7 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
 	}
 
 	publ = tipc_nametbl_insert_publ(net, type, lower, upper, scope,
-					tn->own_addr, port_ref, key);
+					tipc_own_addr(net), port_ref, key);
 	if (likely(publ)) {
 		tn->nametbl->local_publ_count++;
 		buf = tipc_named_publish(net, publ);
@@ -738,7 +738,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 port,
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
 
 	spin_lock_bh(&tn->nametbl_lock);
-	publ = tipc_nametbl_remove_publ(net, type, lower, tn->own_addr,
+	publ = tipc_nametbl_remove_publ(net, type, lower, tipc_own_addr(net),
 					port, key);
 	if (likely(publ)) {
 		tn->nametbl->local_publ_count--;
diff --git a/net/tipc/net.c b/net/tipc/net.c
index eb0d7a352e3f..7f140a5308ee 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -106,7 +106,7 @@
 
 int tipc_net_start(struct net *net, u32 addr)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct tipc_net *tn = tipc_net(net);
 	char addr_string[16];
 
 	tn->own_addr = addr;
@@ -117,25 +117,24 @@ int tipc_net_start(struct net *net, u32 addr)
 	tipc_named_reinit(net);
 	tipc_sk_reinit(net);
 
-	tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr,
-			     TIPC_CLUSTER_SCOPE, 0, tn->own_addr);
+	tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr,
+			     TIPC_CLUSTER_SCOPE, 0, addr);
 
 	pr_info("Started in network mode\n");
 	pr_info("Own node address %s, cluster identity %u\n",
-		tipc_addr_string_fill(addr_string, tn->own_addr),
+		tipc_addr_string_fill(addr_string, addr),
 		tn->net_id);
 	return 0;
 }
 
 void tipc_net_stop(struct net *net)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	u32 self = tipc_own_addr(net);
 
-	if (!tn->own_addr)
+	if (!self)
 		return;
 
-	tipc_nametbl_withdraw(net, TIPC_CFG_SRV, tn->own_addr, 0,
-			      tn->own_addr);
+	tipc_nametbl_withdraw(net, TIPC_CFG_SRV, self, 0, self);
 	rtnl_lock();
 	tipc_bearer_stop(net);
 	tipc_node_stop(net);
@@ -202,9 +201,9 @@ out:
 
 int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
 {
-	struct net *net = sock_net(skb->sk);
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	struct nlattr *attrs[TIPC_NLA_NET_MAX + 1];
+	struct net *net = sock_net(skb->sk);
+	struct tipc_net *tn = tipc_net(net);
 	int err;
 
 	if (!info->attrs[TIPC_NLA_NET])
@@ -216,13 +215,13 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
 	if (err)
 		return err;
 
+	/* Can't change net id once TIPC has joined a network */
+	if (tipc_own_addr(net))
+		return -EPERM;
+
 	if (attrs[TIPC_NLA_NET_ID]) {
 		u32 val;
 
-		/* Can't change net id once TIPC has joined a network */
-		if (tn->own_addr)
-			return -EPERM;
-
 		val = nla_get_u32(attrs[TIPC_NLA_NET_ID]);
 		if (val < 1 || val > 9999)
 			return -EINVAL;
@@ -233,10 +232,6 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
 	if (attrs[TIPC_NLA_NET_ADDR]) {
 		u32 addr;
 
-		/* Can't change net addr once TIPC has joined a network */
-		if (tn->own_addr)
-			return -EPERM;
-
 		addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]);
 		if (!addr)
 			return -EINVAL;
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 732ec894f69f..275b666f6231 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -289,10 +289,9 @@ static bool tipc_sk_type_connectionless(struct sock *sk)
 static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg)
 {
 	struct sock *sk = &tsk->sk;
-	struct tipc_net *tn = net_generic(sock_net(sk), tipc_net_id);
+	u32 self = tipc_own_addr(sock_net(sk));
 	u32 peer_port = tsk_peer_port(tsk);
-	u32 orig_node;
-	u32 peer_node;
+	u32 orig_node, peer_node;
 
 	if (unlikely(!tipc_sk_connected(sk)))
 		return false;
@@ -306,10 +305,10 @@ static bool tsk_peer_msg(struct tipc_sock *tsk, struct tipc_msg *msg)
 	if (likely(orig_node == peer_node))
 		return true;
 
-	if (!orig_node && (peer_node == tn->own_addr))
+	if (!orig_node && peer_node == self)
 		return true;
 
-	if (!peer_node && (orig_node == tn->own_addr))
+	if (!peer_node && orig_node == self)
 		return true;
 
 	return false;
@@ -461,8 +460,8 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
 	/* Ensure tsk is visible before we read own_addr. */
 	smp_mb();
 
-	tipc_msg_init(tn->own_addr, msg, TIPC_LOW_IMPORTANCE, TIPC_NAMED_MSG,
-		      NAMED_H_SIZE, 0);
+	tipc_msg_init(tipc_own_addr(net), msg, TIPC_LOW_IMPORTANCE,
+		      TIPC_NAMED_MSG, NAMED_H_SIZE, 0);
 
 	msg_set_origport(msg, tsk->portid);
 	timer_setup(&sk->sk_timer, tipc_sk_timeout, 0);
@@ -671,7 +670,6 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
 	struct sockaddr_tipc *addr = (struct sockaddr_tipc *)uaddr;
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
-	struct tipc_net *tn = net_generic(sock_net(sock->sk), tipc_net_id);
 
 	memset(addr, 0, sizeof(*addr));
 	if (peer) {
@@ -682,7 +680,7 @@ static int tipc_getname(struct socket *sock, struct sockaddr *uaddr,
 		addr->addr.id.node = tsk_peer_node(tsk);
 	} else {
 		addr->addr.id.ref = tsk->portid;
-		addr->addr.id.node = tn->own_addr;
+		addr->addr.id.node = tipc_own_addr(sock_net(sk));
 	}
 
 	addr->addrtype = TIPC_ADDR_ID;
@@ -2667,8 +2665,8 @@ void tipc_sk_reinit(struct net *net)
 		while ((tsk = rhashtable_walk_next(&iter)) && !IS_ERR(tsk)) {
 			spin_lock_bh(&tsk->sk.sk_lock.slock);
 			msg = &tsk->phdr;
-			msg_set_prevnode(msg, tn->own_addr);
-			msg_set_orignode(msg, tn->own_addr);
+			msg_set_prevnode(msg, tipc_own_addr(net));
+			msg_set_orignode(msg, tipc_own_addr(net));
 			spin_unlock_bh(&tsk->sk.sk_lock.slock);
 		}
 
@@ -3167,11 +3165,10 @@ static int __tipc_nl_add_sk_info(struct sk_buff *skb, struct tipc_sock
 			  *tsk)
 {
 	struct net *net = sock_net(skb->sk);
-	struct tipc_net *tn = tipc_net(net);
 	struct sock *sk = &tsk->sk;
 
 	if (nla_put_u32(skb, TIPC_NLA_SOCK_REF, tsk->portid) ||
-	    nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tn->own_addr))
+	    nla_put_u32(skb, TIPC_NLA_SOCK_ADDR, tipc_own_addr(net)))
 		return -EMSGSIZE;
 
 	if (tipc_sk_connected(sk)) {
-- 
cgit v1.2.3


From d50ccc2d3909fc1b4d40e4af16b026f05dc68707 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 22 Mar 2018 20:42:50 +0100
Subject: tipc: add 128-bit node identifier

We add a 128-bit node identity, as an alternative to the currently used
32-bit node address.

For the sake of compatibility and to minimize message header changes
we retain the existing 32-bit address field. When not set explicitly by
the user, this field will be filled with a hash value generated from the
much longer node identity, and be used as a shorthand value for the
latter.

We permit either the address or the identity to be set by configuration,
but not both, so when the address value is set by a legacy user the
corresponding 128-bit node identity is generated based on the that value.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/addr.c       | 81 ++++++++++++++++++++++++++++++++++++++++++---------
 net/tipc/addr.h       | 28 ++++++++++++++----
 net/tipc/core.c       |  4 ++-
 net/tipc/core.h       |  6 +++-
 net/tipc/discover.c   |  4 +--
 net/tipc/link.c       |  6 +++-
 net/tipc/name_distr.c |  6 ++--
 net/tipc/net.c        | 51 ++++++++++++++++++++++----------
 net/tipc/net.h        |  4 +--
 net/tipc/node.c       |  8 ++---
 net/tipc/node.h       |  4 +--
 11 files changed, 146 insertions(+), 56 deletions(-)

(limited to 'net')

diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index 6e06b4d981f1..4841e98591d0 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -1,7 +1,7 @@
 /*
  * net/tipc/addr.c: TIPC address utility routines
  *
- * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2000-2006, 2018, Ericsson AB
  * Copyright (c) 2004-2005, 2010-2011, Wind River Systems
  * All rights reserved.
  *
@@ -34,18 +34,9 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <linux/kernel.h>
 #include "addr.h"
 #include "core.h"
 
-/**
- * in_own_node - test for node inclusion; <0.0.0> always matches
- */
-int in_own_node(struct net *net, u32 addr)
-{
-	return addr == tipc_own_addr(net) || !addr;
-}
-
 bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr)
 {
 	if (!domain || (domain == addr))
@@ -61,9 +52,71 @@ bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr)
 	return false;
 }
 
-char *tipc_addr_string_fill(char *string, u32 addr)
+void tipc_set_node_id(struct net *net, u8 *id)
+{
+	struct tipc_net *tn = tipc_net(net);
+	u32 *tmp = (u32 *)id;
+
+	memcpy(tn->node_id, id, NODE_ID_LEN);
+	tipc_nodeid2string(tn->node_id_string, id);
+	tn->node_addr = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3];
+	pr_info("Own node identity %s, cluster identity %u\n",
+		tipc_own_id_string(net), tn->net_id);
+}
+
+void tipc_set_node_addr(struct net *net, u32 addr)
 {
-	snprintf(string, 16, "<%u.%u.%u>",
-		 tipc_zone(addr), tipc_cluster(addr), tipc_node(addr));
-	return string;
+	struct tipc_net *tn = tipc_net(net);
+	u8 node_id[NODE_ID_LEN] = {0,};
+
+	tn->node_addr = addr;
+	if (!tipc_own_id(net)) {
+		sprintf(node_id, "%x", addr);
+		tipc_set_node_id(net, node_id);
+	}
+	pr_info("32-bit node address hash set to %x\n", addr);
+}
+
+char *tipc_nodeid2string(char *str, u8 *id)
+{
+	int i;
+	u8 c;
+
+	/* Already a string ? */
+	for (i = 0; i < NODE_ID_LEN; i++) {
+		c = id[i];
+		if (c >= '0' && c <= '9')
+			continue;
+		if (c >= 'A' && c <= 'Z')
+			continue;
+		if (c >= 'a' && c <= 'z')
+			continue;
+		if (c == '.')
+			continue;
+		if (c == ':')
+			continue;
+		if (c == '_')
+			continue;
+		if (c == '-')
+			continue;
+		if (c == '@')
+			continue;
+		if (c != 0)
+			break;
+	}
+	if (i == NODE_ID_LEN) {
+		memcpy(str, id, NODE_ID_LEN);
+		str[NODE_ID_LEN] = 0;
+		return str;
+	}
+
+	/* Translate to hex string */
+	for (i = 0; i < NODE_ID_LEN; i++)
+		sprintf(&str[2 * i], "%02x", id[i]);
+
+	/* Strip off trailing zeroes */
+	for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--)
+		str[i] = 0;
+
+	return str;
 }
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
index 6b48f0dc0205..31bee0ea7b3e 100644
--- a/net/tipc/addr.h
+++ b/net/tipc/addr.h
@@ -1,7 +1,7 @@
 /*
  * net/tipc/addr.h: Include file for TIPC address utility routines
  *
- * Copyright (c) 2000-2006, Ericsson AB
+ * Copyright (c) 2000-2006, 2018, Ericsson AB
  * Copyright (c) 2004-2005, Wind River Systems
  * All rights reserved.
  *
@@ -44,10 +44,22 @@
 #include "core.h"
 
 static inline u32 tipc_own_addr(struct net *net)
+{
+	return tipc_net(net)->node_addr;
+}
+
+static inline u8 *tipc_own_id(struct net *net)
 {
 	struct tipc_net *tn = tipc_net(net);
 
-	return tn->own_addr;
+	if (!strlen(tn->node_id_string))
+		return NULL;
+	return tn->node_id;
+}
+
+static inline char *tipc_own_id_string(struct net *net)
+{
+	return tipc_net(net)->node_id_string;
 }
 
 static inline u32 tipc_cluster_mask(u32 addr)
@@ -65,9 +77,15 @@ static inline int tipc_scope2node(struct net *net, int sc)
 	return sc != TIPC_NODE_SCOPE ? 0 : tipc_own_addr(net);
 }
 
-u32 tipc_own_addr(struct net *net);
-int in_own_node(struct net *net, u32 addr);
+static inline int in_own_node(struct net *net, u32 addr)
+{
+	return addr == tipc_own_addr(net) || !addr;
+}
+
 bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr);
-char *tipc_addr_string_fill(char *string, u32 addr);
+void tipc_set_node_id(struct net *net, u8 *id);
+void tipc_set_node_addr(struct net *net, u32 addr);
+char *tipc_nodeid2string(char *str, u8 *id);
+u32 tipc_node_id2hash(u8 *id128);
 
 #endif
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 04fd91bb11d7..e92fed49e095 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -56,7 +56,9 @@ static int __net_init tipc_init_net(struct net *net)
 	int err;
 
 	tn->net_id = 4711;
-	tn->own_addr = 0;
+	tn->node_addr = 0;
+	memset(tn->node_id, 0, sizeof(tn->node_id));
+	memset(tn->node_id_string, 0, sizeof(tn->node_id_string));
 	tn->mon_threshold = TIPC_DEF_MON_THRESHOLD;
 	get_random_bytes(&tn->random, sizeof(int));
 	INIT_LIST_HEAD(&tn->node_list);
diff --git a/net/tipc/core.h b/net/tipc/core.h
index bd2b112680a4..eabad41cc832 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -72,13 +72,17 @@ struct tipc_monitor;
 #define NODE_HTABLE_SIZE       512
 #define MAX_BEARERS	         3
 #define TIPC_DEF_MON_THRESHOLD  32
+#define NODE_ID_LEN             16
+#define NODE_ID_STR_LEN        (NODE_ID_LEN * 2 + 1)
 
 extern unsigned int tipc_net_id __read_mostly;
 extern int sysctl_tipc_rmem[3] __read_mostly;
 extern int sysctl_tipc_named_timeout __read_mostly;
 
 struct tipc_net {
-	u32 own_addr;
+	u8  node_id[NODE_ID_LEN];
+	u32 node_addr;
+	char node_id_string[NODE_ID_STR_LEN];
 	int net_id;
 	int random;
 	bool legacy_addr_format;
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 94d524018ca5..b4c4cd176b9b 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -118,13 +118,11 @@ static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, u32 src,
 static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr,
 			    struct tipc_media_addr *media_addr)
 {
-	char node_addr_str[16];
 	char media_addr_str[64];
 
-	tipc_addr_string_fill(node_addr_str, node_addr);
 	tipc_media_addr_printf(media_addr_str, sizeof(media_addr_str),
 			       media_addr);
-	pr_warn("Duplicate %s using %s seen on <%s>\n", node_addr_str,
+	pr_warn("Duplicate %x using %s seen on <%s>\n", node_addr,
 		media_addr_str, b->name);
 }
 
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 4aa56e3bf4fc..bcd76b1e440c 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -442,6 +442,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
 		      struct sk_buff_head *namedq,
 		      struct tipc_link **link)
 {
+	char *self_str = tipc_own_id_string(net);
 	struct tipc_link *l;
 
 	l = kzalloc(sizeof(*l), GFP_ATOMIC);
@@ -451,7 +452,10 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
 	l->session = session;
 
 	/* Note: peer i/f name is completed by reset/activate message */
-	sprintf(l->name, "%x:%s-%x:unknown", self, if_name, peer);
+	if (strlen(self_str) > 16)
+		sprintf(l->name, "%x:%s-%x:unknown", self, if_name, peer);
+	else
+		sprintf(l->name, "%s:%s-%x:unknown", self_str, if_name, peer);
 	strcpy(l->if_name, if_name);
 	l->addr = peer;
 	l->peer_caps = peer_caps;
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 7e571f4f47bc..8240a85b0d0c 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -318,7 +318,6 @@ void tipc_named_process_backlog(struct net *net)
 {
 	struct distr_queue_item *e, *tmp;
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	char addr[16];
 	unsigned long now = get_jiffies_64();
 
 	list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) {
@@ -326,12 +325,11 @@ void tipc_named_process_backlog(struct net *net)
 			if (!tipc_update_nametbl(net, &e->i, e->node, e->dtype))
 				continue;
 		} else {
-			tipc_addr_string_fill(addr, e->node);
-			pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %s key=%u\n",
+			pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %x key=%u\n",
 					    e->dtype, ntohl(e->i.type),
 					    ntohl(e->i.lower),
 					    ntohl(e->i.upper),
-					    addr, ntohl(e->i.key));
+					    e->node, ntohl(e->i.key));
 		}
 		list_del(&e->next);
 		kfree(e);
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 7f140a5308ee..e78674891166 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -104,27 +104,31 @@
  *     - A local spin_lock protecting the queue of subscriber events.
 */
 
-int tipc_net_start(struct net *net, u32 addr)
+int tipc_net_init(struct net *net, u8 *node_id, u32 addr)
 {
-	struct tipc_net *tn = tipc_net(net);
-	char addr_string[16];
+	if (tipc_own_id(net)) {
+		pr_info("Cannot configure node identity twice\n");
+		return -1;
+	}
+	pr_info("Started in network mode\n");
 
-	tn->own_addr = addr;
+	if (node_id) {
+		tipc_set_node_id(net, node_id);
+		tipc_net_finalize(net, tipc_own_addr(net));
+	}
+	if (addr)
+		tipc_net_finalize(net, addr);
+	return 0;
+}
 
-	/* Ensure that the new address is visible before we reinit. */
+void tipc_net_finalize(struct net *net, u32 addr)
+{
+	tipc_set_node_addr(net, addr);
 	smp_mb();
-
 	tipc_named_reinit(net);
 	tipc_sk_reinit(net);
-
 	tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr,
 			     TIPC_CLUSTER_SCOPE, 0, addr);
-
-	pr_info("Started in network mode\n");
-	pr_info("Own node address %s, cluster identity %u\n",
-		tipc_addr_string_fill(addr_string, addr),
-		tn->net_id);
-	return 0;
 }
 
 void tipc_net_stop(struct net *net)
@@ -146,8 +150,10 @@ void tipc_net_stop(struct net *net)
 static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	void *hdr;
+	u64 *w0 = (u64 *)&tn->node_id[0];
+	u64 *w1 = (u64 *)&tn->node_id[8];
 	struct nlattr *attrs;
+	void *hdr;
 
 	hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
 			  NLM_F_MULTI, TIPC_NL_NET_GET);
@@ -160,7 +166,10 @@ static int __tipc_nl_add_net(struct net *net, struct tipc_nl_msg *msg)
 
 	if (nla_put_u32(msg->skb, TIPC_NLA_NET_ID, tn->net_id))
 		goto attr_msg_full;
-
+	if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID, *w0, 0))
+		goto attr_msg_full;
+	if (nla_put_u64_64bit(msg->skb, TIPC_NLA_NET_NODEID_W1, *w1, 0))
+		goto attr_msg_full;
 	nla_nest_end(msg->skb, attrs);
 	genlmsg_end(msg->skb, hdr);
 
@@ -212,6 +221,7 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
 	err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX,
 			       info->attrs[TIPC_NLA_NET], tipc_nl_net_policy,
 			       info->extack);
+
 	if (err)
 		return err;
 
@@ -236,9 +246,18 @@ int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info)
 		if (!addr)
 			return -EINVAL;
 		tn->legacy_addr_format = true;
-		tipc_net_start(net, addr);
+		tipc_net_init(net, NULL, addr);
 	}
 
+	if (attrs[TIPC_NLA_NET_NODEID]) {
+		u8 node_id[NODE_ID_LEN];
+		u64 *w0 = (u64 *)&node_id[0];
+		u64 *w1 = (u64 *)&node_id[8];
+
+		*w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]);
+		*w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]);
+		tipc_net_init(net, node_id, 0);
+	}
 	return 0;
 }
 
diff --git a/net/tipc/net.h b/net/tipc/net.h
index c0306aa2374b..08efa6010022 100644
--- a/net/tipc/net.h
+++ b/net/tipc/net.h
@@ -41,10 +41,8 @@
 
 extern const struct nla_policy tipc_nl_net_policy[];
 
-int tipc_net_start(struct net *net, u32 addr);
-
+void tipc_net_finalize(struct net *net, u32 addr);
 void tipc_net_stop(struct net *net);
-
 int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb);
 int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info);
 int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 8a4b04933ecc..7b0c99347406 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -883,11 +883,9 @@ void tipc_node_delete_links(struct net *net, int bearer_id)
 
 static void tipc_node_reset_links(struct tipc_node *n)
 {
-	char addr_string[16];
 	int i;
 
-	pr_warn("Resetting all links to %s\n",
-		tipc_addr_string_fill(addr_string, n->addr));
+	pr_warn("Resetting all links to %x\n", n->addr);
 
 	for (i = 0; i < MAX_BEARERS; i++) {
 		tipc_node_link_down(n, i, false);
@@ -1074,15 +1072,13 @@ illegal_evt:
 static void node_lost_contact(struct tipc_node *n,
 			      struct sk_buff_head *inputq)
 {
-	char addr_string[16];
 	struct tipc_sock_conn *conn, *safe;
 	struct tipc_link *l;
 	struct list_head *conns = &n->conn_sks;
 	struct sk_buff *skb;
 	uint i;
 
-	pr_debug("Lost contact with %s\n",
-		 tipc_addr_string_fill(addr_string, n->addr));
+	pr_debug("Lost contact with %x\n", n->addr);
 
 	/* Clean up broadcast state */
 	tipc_bcast_remove_peer(n->net, n->bc_entry.link);
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 5fb38cf0bb5c..e06faf4fa55e 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -49,14 +49,14 @@ enum {
 	TIPC_BCAST_STATE_NACK = (1 << 2),
 	TIPC_BLOCK_FLOWCTL    = (1 << 3),
 	TIPC_BCAST_RCAST      = (1 << 4),
-	TIPC_NODE_ID32        = (1 << 5)
+	TIPC_NODE_ID128       = (1 << 5)
 };
 
 #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \
 				TIPC_BCAST_STATE_NACK | \
 				TIPC_BCAST_RCAST | \
 				TIPC_BLOCK_FLOWCTL | \
-				TIPC_NODE_ID32)
+				TIPC_NODE_ID128)
 #define INVALID_BEARER_ID -1
 
 void tipc_node_stop(struct net *net);
-- 
cgit v1.2.3


From 25b0b9c4e835ffaa65b61c3efe2e28acf84d0259 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 22 Mar 2018 20:42:51 +0100
Subject: tipc: handle collisions of 32-bit node address hash values

When a 32-bit node address is generated from a 128-bit identifier,
there is a risk of collisions which must be discovered and handled.

We do this as follows:
- We don't apply the generated address immediately to the node, but do
  instead initiate a 1 sec trial period to allow other cluster members
  to discover and handle such collisions.

- During the trial period the node periodically sends out a new type
  of message, DSC_TRIAL_MSG, using broadcast or emulated broadcast,
  to all the other nodes in the cluster.

- When a node is receiving such a message, it must check that the
  presented 32-bit identifier either is unused, or was used by the very
  same peer in a previous session. In both cases it accepts the request
  by not responding to it.

- If it finds that the same node has been up before using a different
  address, it responds with a DSC_TRIAL_FAIL_MSG containing that
  address.

- If it finds that the address has already been taken by some other
  node, it generates a new, unused address and returns it to the
  requester.

- During the trial period the requesting node must always be prepared
  to accept a failure message, i.e., a message where a peer suggests a
  different (or equal)  address to the one tried. In those cases it
  must apply the suggested value as trial address and restart the trial
  period.

This algorithm ensures that in the vast majority of cases a node will
have the same address before and after a reboot. If a legacy user
configures the address explicitly, there will be no trial period and
messages, so this protocol addition is completely backwards compatible.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/addr.c     |   3 +-
 net/tipc/bearer.c   |   3 +-
 net/tipc/core.c     |   2 +
 net/tipc/core.h     |   2 +
 net/tipc/discover.c | 126 ++++++++++++++++++++++++++++++++++++++++++++--------
 net/tipc/link.c     |  26 +++++++----
 net/tipc/link.h     |   4 +-
 net/tipc/msg.h      |  23 +++++++++-
 net/tipc/net.c      |   4 +-
 net/tipc/node.c     |  85 ++++++++++++++++++++++++++++++++---
 net/tipc/node.h     |   3 +-
 11 files changed, 236 insertions(+), 45 deletions(-)

(limited to 'net')

diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index 4841e98591d0..b88d48d00913 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -59,7 +59,7 @@ void tipc_set_node_id(struct net *net, u8 *id)
 
 	memcpy(tn->node_id, id, NODE_ID_LEN);
 	tipc_nodeid2string(tn->node_id_string, id);
-	tn->node_addr = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3];
+	tn->trial_addr = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3];
 	pr_info("Own node identity %s, cluster identity %u\n",
 		tipc_own_id_string(net), tn->net_id);
 }
@@ -74,6 +74,7 @@ void tipc_set_node_addr(struct net *net, u32 addr)
 		sprintf(node_id, "%x", addr);
 		tipc_set_node_id(net, node_id);
 	}
+	tn->trial_addr = addr;
 	pr_info("32-bit node address hash set to %x\n", addr);
 }
 
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index a71f31879cb3..ae5b44ca1c1e 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -235,7 +235,6 @@ static int tipc_enable_bearer(struct net *net, const char *name,
 {
 	struct tipc_net *tn = tipc_net(net);
 	struct tipc_bearer_names b_names;
-	u32 self = tipc_own_addr(net);
 	int with_this_prio = 1;
 	struct tipc_bearer *b;
 	struct tipc_media *m;
@@ -244,7 +243,7 @@ static int tipc_enable_bearer(struct net *net, const char *name,
 	int res = -EINVAL;
 	char *errstr = "";
 
-	if (!self) {
+	if (!tipc_own_id(net)) {
 		errstr = "not supported in standalone mode";
 		res = -ENOPROTOOPT;
 		goto rejected;
diff --git a/net/tipc/core.c b/net/tipc/core.c
index e92fed49e095..52dfc51ac4d5 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -57,6 +57,8 @@ static int __net_init tipc_init_net(struct net *net)
 
 	tn->net_id = 4711;
 	tn->node_addr = 0;
+	tn->trial_addr = 0;
+	tn->addr_trial_end = 0;
 	memset(tn->node_id, 0, sizeof(tn->node_id));
 	memset(tn->node_id_string, 0, sizeof(tn->node_id_string));
 	tn->mon_threshold = TIPC_DEF_MON_THRESHOLD;
diff --git a/net/tipc/core.h b/net/tipc/core.h
index eabad41cc832..d0f64ca62d02 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -82,6 +82,8 @@ extern int sysctl_tipc_named_timeout __read_mostly;
 struct tipc_net {
 	u8  node_id[NODE_ID_LEN];
 	u32 node_addr;
+	u32 trial_addr;
+	unsigned long addr_trial_end;
 	char node_id_string[NODE_ID_STR_LEN];
 	int net_id;
 	int random;
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index b4c4cd176b9b..e7655736abed 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -1,7 +1,7 @@
 /*
  * net/tipc/discover.c
  *
- * Copyright (c) 2003-2006, 2014-2015, Ericsson AB
+ * Copyright (c) 2003-2006, 2014-2018, Ericsson AB
  * Copyright (c) 2005-2006, 2010-2011, Wind River Systems
  * All rights reserved.
  *
@@ -78,34 +78,40 @@ struct tipc_discoverer {
  * @b: ptr to bearer issuing message
  */
 static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb,
-			       u32 mtyp, struct tipc_bearer *b)
+			       u32 mtyp,  struct tipc_bearer *b)
 {
 	struct tipc_net *tn = tipc_net(net);
-	u32 self = tipc_own_addr(net);
 	u32 dest_domain = b->domain;
 	struct tipc_msg *hdr;
 
 	hdr = buf_msg(skb);
-	tipc_msg_init(self, hdr, LINK_CONFIG, mtyp,
+	tipc_msg_init(tn->trial_addr, hdr, LINK_CONFIG, mtyp,
 		      MAX_H_SIZE, dest_domain);
+	msg_set_size(hdr, MAX_H_SIZE + NODE_ID_LEN);
 	msg_set_non_seq(hdr, 1);
 	msg_set_node_sig(hdr, tn->random);
 	msg_set_node_capabilities(hdr, TIPC_NODE_CAPABILITIES);
 	msg_set_dest_domain(hdr, dest_domain);
 	msg_set_bc_netid(hdr, tn->net_id);
 	b->media->addr2msg(msg_media_addr(hdr), &b->addr);
+	msg_set_node_id(hdr, tipc_own_id(net));
 }
 
-static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, u32 src,
+static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst,
+			       u32 src, u32 sugg_addr,
 			       struct tipc_media_addr *maddr,
 			       struct tipc_bearer *b)
 {
+	struct tipc_msg *hdr;
 	struct sk_buff *skb;
 
-	skb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
+	skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC);
 	if (!skb)
 		return;
+	hdr = buf_msg(skb);
 	tipc_disc_init_msg(net, skb, mtyp, b);
+	msg_set_sugg_node_addr(hdr, sugg_addr);
+	msg_set_dest_domain(hdr, dst);
 	tipc_bearer_xmit_skb(net, b->identity, skb, maddr);
 }
 
@@ -126,6 +132,52 @@ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr,
 		media_addr_str, b->name);
 }
 
+/* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer
+ */
+bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d,
+			      struct tipc_media_addr *maddr,
+			      struct tipc_bearer *b,
+			      u32 dst, u32 src,
+			      u32 sugg_addr,
+			      u8 *peer_id,
+			      int mtyp)
+{
+	struct net *net = d->net;
+	struct tipc_net *tn = tipc_net(net);
+	bool trial = time_before(jiffies, tn->addr_trial_end);
+	u32 self = tipc_own_addr(net);
+
+	if (mtyp == DSC_TRIAL_FAIL_MSG) {
+		if (!trial)
+			return true;
+
+		/* Ignore if somebody else already gave new suggestion */
+		if (dst != tn->trial_addr)
+			return true;
+
+		/* Otherwise update trial address and restart trial period */
+		tn->trial_addr = sugg_addr;
+		msg_set_prevnode(buf_msg(d->skb), sugg_addr);
+		tn->addr_trial_end = jiffies + msecs_to_jiffies(1000);
+		return true;
+	}
+
+	/* Apply trial address if we just left trial period */
+	if (!trial && !self) {
+		tipc_net_finalize(net, tn->trial_addr);
+		msg_set_type(buf_msg(d->skb), DSC_REQ_MSG);
+	}
+
+	if (mtyp != DSC_TRIAL_MSG)
+		return false;
+
+	sugg_addr = tipc_node_try_addr(net, peer_id, src);
+	if (sugg_addr)
+		tipc_disc_msg_xmit(net, DSC_TRIAL_FAIL_MSG, src,
+				   self, sugg_addr, maddr, b);
+	return true;
+}
+
 /**
  * tipc_disc_rcv - handle incoming discovery message (request or response)
  * @net: applicable net namespace
@@ -139,17 +191,27 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
 	struct tipc_msg *hdr = buf_msg(skb);
 	u16 caps = msg_node_capabilities(hdr);
 	bool legacy = tn->legacy_addr_format;
+	u32 sugg = msg_sugg_node_addr(hdr);
 	u32 signature = msg_node_sig(hdr);
+	u8 peer_id[NODE_ID_LEN] = {0,};
 	u32 dst = msg_dest_domain(hdr);
 	u32 net_id = msg_bc_netid(hdr);
-	u32 self = tipc_own_addr(net);
 	struct tipc_media_addr maddr;
 	u32 src = msg_prevnode(hdr);
 	u32 mtyp = msg_type(hdr);
 	bool dupl_addr = false;
 	bool respond = false;
+	u32 self;
 	int err;
 
+	skb_linearize(skb);
+	hdr = buf_msg(skb);
+
+	if (caps & TIPC_NODE_ID128)
+		memcpy(peer_id, msg_node_id(hdr), NODE_ID_LEN);
+	else
+		sprintf(peer_id, "%x", src);
+
 	err = b->media->msg2addr(b, &maddr, msg_media_addr(hdr));
 	kfree_skb(skb);
 	if (err || maddr.broadcast) {
@@ -161,6 +223,12 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
 		return;
 	if (net_id != tn->net_id)
 		return;
+	if (tipc_disc_addr_trial_msg(b->disc, &maddr, b, dst,
+				     src, sugg, peer_id, mtyp))
+		return;
+	self = tipc_own_addr(net);
+
+	/* Message from somebody using this node's address */
 	if (in_own_node(net, src)) {
 		disc_dupl_alert(b, self, &maddr);
 		return;
@@ -169,8 +237,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
 		return;
 	if (!tipc_in_scope(legacy, b->domain, src))
 		return;
-
-	tipc_node_check_dest(net, src, b, caps, signature,
+	tipc_node_check_dest(net, src, peer_id, b, caps, signature,
 			     &maddr, &respond, &dupl_addr);
 	if (dupl_addr)
 		disc_dupl_alert(b, src, &maddr);
@@ -178,7 +245,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb,
 		return;
 	if (mtyp != DSC_REQ_MSG)
 		return;
-	tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, &maddr, b);
+	tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, 0, &maddr, b);
 }
 
 /* tipc_disc_add_dest - increment set of discovered nodes
@@ -216,9 +283,11 @@ void tipc_disc_remove_dest(struct tipc_discoverer *d)
 static void tipc_disc_timeout(struct timer_list *t)
 {
 	struct tipc_discoverer *d = from_timer(d, t, timer);
+	struct tipc_net *tn = tipc_net(d->net);
+	u32 self = tipc_own_addr(d->net);
 	struct tipc_media_addr maddr;
 	struct sk_buff *skb = NULL;
-	struct net *net;
+	struct net *net = d->net;
 	u32 bearer_id;
 
 	spin_lock_bh(&d->lock);
@@ -228,16 +297,29 @@ static void tipc_disc_timeout(struct timer_list *t)
 		d->timer_intv = TIPC_DISC_INACTIVE;
 		goto exit;
 	}
+
+	/* Did we just leave the address trial period ? */
+	if (!self && !time_before(jiffies, tn->addr_trial_end)) {
+		self = tn->trial_addr;
+		tipc_net_finalize(net, self);
+		msg_set_prevnode(buf_msg(d->skb), self);
+		msg_set_type(buf_msg(d->skb), DSC_REQ_MSG);
+	}
+
 	/* Adjust timeout interval according to discovery phase */
-	d->timer_intv *= 2;
-	if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW)
-		d->timer_intv = TIPC_DISC_SLOW;
-	else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST)
-		d->timer_intv = TIPC_DISC_FAST;
+	if (time_before(jiffies, tn->addr_trial_end)) {
+		d->timer_intv = TIPC_DISC_INIT;
+	} else {
+		d->timer_intv *= 2;
+		if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW)
+			d->timer_intv = TIPC_DISC_SLOW;
+		else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST)
+			d->timer_intv = TIPC_DISC_FAST;
+	}
+
 	mod_timer(&d->timer, jiffies + d->timer_intv);
 	memcpy(&maddr, &d->dest, sizeof(maddr));
 	skb = skb_clone(d->skb, GFP_ATOMIC);
-	net = d->net;
 	bearer_id = d->bearer_id;
 exit:
 	spin_unlock_bh(&d->lock);
@@ -257,18 +339,24 @@ exit:
 int tipc_disc_create(struct net *net, struct tipc_bearer *b,
 		     struct tipc_media_addr *dest, struct sk_buff **skb)
 {
+	struct tipc_net *tn = tipc_net(net);
 	struct tipc_discoverer *d;
 
 	d = kmalloc(sizeof(*d), GFP_ATOMIC);
 	if (!d)
 		return -ENOMEM;
-	d->skb = tipc_buf_acquire(MAX_H_SIZE, GFP_ATOMIC);
+	d->skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC);
 	if (!d->skb) {
 		kfree(d);
 		return -ENOMEM;
 	}
-
 	tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b);
+
+	/* Do we need an address trial period first ? */
+	if (!tipc_own_addr(net)) {
+		tn->addr_trial_end = jiffies + msecs_to_jiffies(1000);
+		msg_set_type(buf_msg(d->skb), DSC_TRIAL_MSG);
+	}
 	memcpy(&d->dest, dest, sizeof(*dest));
 	d->net = net;
 	d->bearer_id = b->identity;
diff --git a/net/tipc/link.c b/net/tipc/link.c
index bcd76b1e440c..1289b4ba404f 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -434,15 +434,16 @@ char *tipc_link_name(struct tipc_link *l)
  */
 bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
 		      int tolerance, char net_plane, u32 mtu, int priority,
-		      int window, u32 session, u32 self, u32 peer,
-		      u16 peer_caps,
+		      int window, u32 session, u32 self,
+		      u32 peer, u8 *peer_id, u16 peer_caps,
 		      struct tipc_link *bc_sndlink,
 		      struct tipc_link *bc_rcvlink,
 		      struct sk_buff_head *inputq,
 		      struct sk_buff_head *namedq,
 		      struct tipc_link **link)
 {
-	char *self_str = tipc_own_id_string(net);
+	char peer_str[NODE_ID_STR_LEN] = {0,};
+	char self_str[NODE_ID_STR_LEN] = {0,};
 	struct tipc_link *l;
 
 	l = kzalloc(sizeof(*l), GFP_ATOMIC);
@@ -451,11 +452,18 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
 	*link = l;
 	l->session = session;
 
-	/* Note: peer i/f name is completed by reset/activate message */
-	if (strlen(self_str) > 16)
-		sprintf(l->name, "%x:%s-%x:unknown", self, if_name, peer);
-	else
-		sprintf(l->name, "%s:%s-%x:unknown", self_str, if_name, peer);
+	/* Set link name for unicast links only */
+	if (peer_id) {
+		tipc_nodeid2string(self_str, tipc_own_id(net));
+		if (strlen(self_str) > 16)
+			sprintf(self_str, "%x", self);
+		tipc_nodeid2string(peer_str, peer_id);
+		if (strlen(peer_str) > 16)
+			sprintf(peer_str, "%x", peer);
+	}
+	/* Peer i/f name will be completed by reset/activate message */
+	sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str);
+
 	strcpy(l->if_name, if_name);
 	l->addr = peer;
 	l->peer_caps = peer_caps;
@@ -503,7 +511,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
 	struct tipc_link *l;
 
 	if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window,
-			      0, ownnode, peer, peer_caps, bc_sndlink,
+			      0, ownnode, peer, NULL, peer_caps, bc_sndlink,
 			      NULL, inputq, namedq, link))
 		return false;
 
diff --git a/net/tipc/link.h b/net/tipc/link.h
index d1bd1787a768..ec59348a81e8 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -73,8 +73,8 @@ enum {
 
 bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
 		      int tolerance, char net_plane, u32 mtu, int priority,
-		      int window, u32 session, u32 ownnode, u32 peer,
-		      u16 peer_caps,
+		      int window, u32 session, u32 ownnode,
+		      u32 peer, u8 *peer_id, u16 peer_caps,
 		      struct tipc_link *bc_sndlink,
 		      struct tipc_link *bc_rcvlink,
 		      struct sk_buff_head *inputq,
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index b4ba1b4f9ae7..a4e944d59394 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -550,6 +550,8 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n)
  */
 #define DSC_REQ_MSG		0
 #define DSC_RESP_MSG		1
+#define DSC_TRIAL_MSG		2
+#define DSC_TRIAL_FAIL_MSG	3
 
 /*
  * Group protocol message types
@@ -627,7 +629,6 @@ static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n)
 	msg_set_bits(m, 2, 0, 0xffff, n);
 }
 
-
 /*
  * Word 4
  */
@@ -925,6 +926,26 @@ static inline bool msg_is_reset(struct tipc_msg *hdr)
 	return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG);
 }
 
+static inline u32 msg_sugg_node_addr(struct tipc_msg *m)
+{
+	return msg_word(m, 14);
+}
+
+static inline void msg_set_sugg_node_addr(struct tipc_msg *m, u32 n)
+{
+	msg_set_word(m, 14, n);
+}
+
+static inline void msg_set_node_id(struct tipc_msg *hdr, u8 *id)
+{
+	memcpy(msg_data(hdr), id, 16);
+}
+
+static inline u8 *msg_node_id(struct tipc_msg *hdr)
+{
+	return (u8 *)msg_data(hdr);
+}
+
 struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp);
 bool tipc_msg_validate(struct sk_buff **_skb);
 bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err);
diff --git a/net/tipc/net.c b/net/tipc/net.c
index e78674891166..29538dc00857 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -112,10 +112,8 @@ int tipc_net_init(struct net *net, u8 *node_id, u32 addr)
 	}
 	pr_info("Started in network mode\n");
 
-	if (node_id) {
+	if (node_id)
 		tipc_set_node_id(net, node_id);
-		tipc_net_finalize(net, tipc_own_addr(net));
-	}
 	if (addr)
 		tipc_net_finalize(net, addr);
 	return 0;
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 7b0c99347406..4a95c8c155c6 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -115,6 +115,7 @@ struct tipc_node {
 	u16 capabilities;
 	u32 signature;
 	u32 link_id;
+	u8 peer_id[16];
 	struct list_head publ_list;
 	struct list_head conn_sks;
 	unsigned long keepalive_intv;
@@ -156,6 +157,7 @@ static void tipc_node_delete(struct tipc_node *node);
 static void tipc_node_timeout(struct timer_list *t);
 static void tipc_node_fsm_evt(struct tipc_node *n, int evt);
 static struct tipc_node *tipc_node_find(struct net *net, u32 addr);
+static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id);
 static void tipc_node_put(struct tipc_node *node);
 static bool node_is_up(struct tipc_node *n);
 
@@ -245,6 +247,30 @@ static struct tipc_node *tipc_node_find(struct net *net, u32 addr)
 	return node;
 }
 
+/* tipc_node_find_by_id - locate specified node object by its 128-bit id
+ * Note: this function is called only when a discovery request failed
+ * to find the node by its 32-bit id, and is not time critical
+ */
+static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id)
+{
+	struct tipc_net *tn = tipc_net(net);
+	struct tipc_node *n;
+	bool found = false;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(n, &tn->node_list, list) {
+		read_lock_bh(&n->lock);
+		if (!memcmp(id, n->peer_id, 16) &&
+		    kref_get_unless_zero(&n->kref))
+			found = true;
+		read_unlock_bh(&n->lock);
+		if (found)
+			break;
+	}
+	rcu_read_unlock();
+	return found ? n : NULL;
+}
+
 static void tipc_node_read_lock(struct tipc_node *n)
 {
 	read_lock_bh(&n->lock);
@@ -307,7 +333,8 @@ static void tipc_node_write_unlock(struct tipc_node *n)
 	}
 }
 
-struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
+struct tipc_node *tipc_node_create(struct net *net, u32 addr,
+				   u8 *peer_id, u16 capabilities)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	struct tipc_node *n, *temp_node;
@@ -326,6 +353,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
 		goto exit;
 	}
 	n->addr = addr;
+	memcpy(&n->peer_id, peer_id, 16);
 	n->net = net;
 	n->capabilities = capabilities;
 	kref_init(&n->kref);
@@ -344,8 +372,8 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
 	n->signature = INVALID_NODE_SIG;
 	n->active_links[0] = INVALID_BEARER_ID;
 	n->active_links[1] = INVALID_BEARER_ID;
-	if (!tipc_link_bc_create(net, tipc_own_addr(net), n->addr,
-				 U16_MAX,
+	if (!tipc_link_bc_create(net, tipc_own_addr(net),
+				 addr, U16_MAX,
 				 tipc_link_window(tipc_bc_sndlink(net)),
 				 n->capabilities,
 				 &n->bc_entry.inputq1,
@@ -735,8 +763,51 @@ bool tipc_node_is_up(struct net *net, u32 addr)
 	return retval;
 }
 
-void tipc_node_check_dest(struct net *net, u32 onode,
-			  struct tipc_bearer *b,
+static u32 tipc_node_suggest_addr(struct net *net, u32 addr)
+{
+	struct tipc_node *n;
+
+	addr ^= tipc_net(net)->random;
+	while ((n = tipc_node_find(net, addr))) {
+		tipc_node_put(n);
+		addr++;
+	}
+	return addr;
+}
+
+/* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not
+ */
+u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr)
+{
+	struct tipc_net *tn = tipc_net(net);
+	struct tipc_node *n;
+
+	/* Suggest new address if some other peer is using this one */
+	n = tipc_node_find(net, addr);
+	if (n) {
+		if (!memcmp(n->peer_id, id, NODE_ID_LEN))
+			addr = 0;
+		tipc_node_put(n);
+		if (!addr)
+			return 0;
+		return tipc_node_suggest_addr(net, addr);
+	}
+
+	/* Suggest previously used address if peer is known */
+	n = tipc_node_find_by_id(net, id);
+	if (n) {
+		addr = n->addr;
+		tipc_node_put(n);
+	}
+	/* Even this node may be in trial phase */
+	if (tn->trial_addr == addr)
+		return tipc_node_suggest_addr(net, addr);
+
+	return addr;
+}
+
+void tipc_node_check_dest(struct net *net, u32 addr,
+			  u8 *peer_id, struct tipc_bearer *b,
 			  u16 capabilities, u32 signature,
 			  struct tipc_media_addr *maddr,
 			  bool *respond, bool *dupl_addr)
@@ -755,7 +826,7 @@ void tipc_node_check_dest(struct net *net, u32 onode,
 	*dupl_addr = false;
 	*respond = false;
 
-	n = tipc_node_create(net, onode, capabilities);
+	n = tipc_node_create(net, addr, peer_id, capabilities);
 	if (!n)
 		return;
 
@@ -840,7 +911,7 @@ void tipc_node_check_dest(struct net *net, u32 onode,
 		if (!tipc_link_create(net, if_name, b->identity, b->tolerance,
 				      b->net_plane, b->mtu, b->priority,
 				      b->window, mod(tipc_net(net)->random),
-				      tipc_own_addr(net), onode,
+				      tipc_own_addr(net), addr, peer_id,
 				      n->capabilities,
 				      tipc_bc_sndlink(n->net), n->bc_entry.link,
 				      &le->inputq,
diff --git a/net/tipc/node.h b/net/tipc/node.h
index e06faf4fa55e..f24b83500df1 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -60,7 +60,8 @@ enum {
 #define INVALID_BEARER_ID -1
 
 void tipc_node_stop(struct net *net);
-void tipc_node_check_dest(struct net *net, u32 onode,
+u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr);
+void tipc_node_check_dest(struct net *net, u32 onode, u8 *peer_id128,
 			  struct tipc_bearer *bearer,
 			  u16 capabilities, u32 signature,
 			  struct tipc_media_addr *maddr,
-- 
cgit v1.2.3


From 52dfae5c85a4c1078e9f1d5e8947d4a25f73dd81 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 22 Mar 2018 20:42:52 +0100
Subject: tipc: obtain node identity from interface by default

Selecting and explicitly configuring a TIPC node identity may be
unwanted in some cases.

In this commit we introduce a default setting if the identity has not
been set at the moment the first bearer is enabled. We do this by
using a raw copy of a unique identifier from the used interface: MAC
address in the case of an L2 bearer, IPv4/IPv6 address in the case
of a UDP bearer.

Acked-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/bearer.c    | 24 +++++++++++++++---------
 net/tipc/net.h       |  1 +
 net/tipc/udp_media.c | 13 +++++++++++++
 3 files changed, 29 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index ae5b44ca1c1e..f7d47c89d658 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -243,12 +243,6 @@ static int tipc_enable_bearer(struct net *net, const char *name,
 	int res = -EINVAL;
 	char *errstr = "";
 
-	if (!tipc_own_id(net)) {
-		errstr = "not supported in standalone mode";
-		res = -ENOPROTOOPT;
-		goto rejected;
-	}
-
 	if (!bearer_name_validate(name, &b_names)) {
 		errstr = "illegal name";
 		goto rejected;
@@ -381,11 +375,13 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b)
 int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
 			 struct nlattr *attr[])
 {
+	char *dev_name = strchr((const char *)b->name, ':') + 1;
+	int hwaddr_len = b->media->hwaddr_len;
+	u8 node_id[NODE_ID_LEN] = {0,};
 	struct net_device *dev;
-	char *driver_name = strchr((const char *)b->name, ':') + 1;
 
 	/* Find device with specified name */
-	dev = dev_get_by_name(net, driver_name);
+	dev = dev_get_by_name(net, dev_name);
 	if (!dev)
 		return -ENODEV;
 	if (tipc_mtu_bad(dev, 0)) {
@@ -393,6 +389,16 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
 		return -EINVAL;
 	}
 
+	/* Autoconfigure own node identity if needed */
+	if (!tipc_own_id(net) && hwaddr_len <= NODE_ID_LEN) {
+		memcpy(node_id, dev->dev_addr, hwaddr_len);
+		tipc_net_init(net, node_id, 0);
+	}
+	if (!tipc_own_id(net)) {
+		pr_warn("Failed to obtain node identity\n");
+		return -EINVAL;
+	}
+
 	/* Associate TIPC bearer with L2 bearer */
 	rcu_assign_pointer(b->media_ptr, dev);
 	b->pt.dev = dev;
@@ -400,7 +406,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
 	b->pt.func = tipc_l2_rcv_msg;
 	dev_add_pack(&b->pt);
 	memset(&b->bcast_addr, 0, sizeof(b->bcast_addr));
-	memcpy(b->bcast_addr.value, dev->broadcast, b->media->hwaddr_len);
+	memcpy(b->bcast_addr.value, dev->broadcast, hwaddr_len);
 	b->bcast_addr.media_id = b->media->type_id;
 	b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
 	b->mtu = dev->mtu;
diff --git a/net/tipc/net.h b/net/tipc/net.h
index 08efa6010022..09ad02b50bb1 100644
--- a/net/tipc/net.h
+++ b/net/tipc/net.h
@@ -41,6 +41,7 @@
 
 extern const struct nla_policy tipc_nl_net_policy[];
 
+int tipc_net_init(struct net *net, u8 *node_id, u32 addr);
 void tipc_net_finalize(struct net *net, u32 addr);
 void tipc_net_stop(struct net *net);
 int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb);
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 3deabcab4882..2c13b18426d9 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -47,6 +47,8 @@
 #include <net/addrconf.h>
 #include <linux/tipc_netlink.h>
 #include "core.h"
+#include "addr.h"
+#include "net.h"
 #include "bearer.h"
 #include "netlink.h"
 #include "msg.h"
@@ -647,6 +649,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 	struct udp_port_cfg udp_conf = {0};
 	struct udp_tunnel_sock_cfg tuncfg = {NULL};
 	struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
+	u8 node_id[NODE_ID_LEN] = {0,};
 
 	ub = kzalloc(sizeof(*ub), GFP_ATOMIC);
 	if (!ub)
@@ -677,6 +680,16 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 	if (err)
 		goto err;
 
+	/* Autoconfigure own node identity if needed */
+	if (!tipc_own_id(net)) {
+		memcpy(node_id, local.ipv6.in6_u.u6_addr8, 16);
+		tipc_net_init(net, node_id, 0);
+	}
+	if (!tipc_own_id(net)) {
+		pr_warn("Failed to set node id, please configure manually\n");
+		return -EINVAL;
+	}
+
 	b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP;
 	b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
 	rcu_assign_pointer(b->media_ptr, ub);
-- 
cgit v1.2.3


From affaa0c724c14c914625647efe7b95dfbe8d08f2 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Fri, 23 Mar 2018 19:09:39 +0100
Subject: net/sched: remove tcf_idr_cleanup()

tcf_idr_cleanup() is no more used, so remove it.

Suggested-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_api.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'net')

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 57cf37145282..7bd1b964f021 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -296,14 +296,6 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
 }
 EXPORT_SYMBOL(tcf_idr_check);
 
-void tcf_idr_cleanup(struct tc_action *a, struct nlattr *est)
-{
-	if (est)
-		gen_kill_estimator(&a->tcfa_rate_est);
-	free_tcf(a);
-}
-EXPORT_SYMBOL(tcf_idr_cleanup);
-
 int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 		   struct tc_action **a, const struct tc_action_ops *ops,
 		   int bind, bool cpustats)
-- 
cgit v1.2.3


From 94cb5492409219ee3f9468616dd58af314029f76 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Fri, 23 Mar 2018 19:31:30 +0100
Subject: net/sched: act_vlan: declare push_vid with host byte order

use u16 in place of __be16 to suppress the following sparse warnings:

 net/sched/act_vlan.c:150:26: warning: incorrect type in assignment (different base types)
 net/sched/act_vlan.c:150:26: expected restricted __be16 [usertype] push_vid
 net/sched/act_vlan.c:150:26: got unsigned short
 net/sched/act_vlan.c:151:21: warning: restricted __be16 degrades to integer
 net/sched/act_vlan.c:208:26: warning: incorrect type in assignment (different base types)
 net/sched/act_vlan.c:208:26: expected unsigned short [unsigned] [usertype] tcfv_push_vid
 net/sched/act_vlan.c:208:26: got restricted __be16 [usertype] push_vid

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/act_vlan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 4595391c2129..41a66effeb5f 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -117,7 +117,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
 	struct tc_vlan *parm;
 	struct tcf_vlan *v;
 	int action;
-	__be16 push_vid = 0;
+	u16 push_vid = 0;
 	__be16 push_proto = 0;
 	u8 push_prio = 0;
 	bool exists = false;
-- 
cgit v1.2.3


From 13acc94eff122b260a387d68668bf9d670738e6a Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 21 Mar 2018 16:31:03 -0700
Subject: net: permit skb_segment on head_frag frag_list skb

One of our in-house projects, bpf-based NAT, hits a kernel BUG_ON at
function skb_segment(), line 3667. The bpf program attaches to
clsact ingress, calls bpf_skb_change_proto to change protocol
from ipv4 to ipv6 or from ipv6 to ipv4, and then calls bpf_redirect
to send the changed packet out.

3472 struct sk_buff *skb_segment(struct sk_buff *head_skb,
3473                             netdev_features_t features)
3474 {
3475         struct sk_buff *segs = NULL;
3476         struct sk_buff *tail = NULL;
...
3665                 while (pos < offset + len) {
3666                         if (i >= nfrags) {
3667                                 BUG_ON(skb_headlen(list_skb));
3668
3669                                 i = 0;
3670                                 nfrags = skb_shinfo(list_skb)->nr_frags;
3671                                 frag = skb_shinfo(list_skb)->frags;
3672                                 frag_skb = list_skb;
...

call stack:
...
 #1 [ffff883ffef03558] __crash_kexec at ffffffff8110c525
 #2 [ffff883ffef03620] crash_kexec at ffffffff8110d5cc
 #3 [ffff883ffef03640] oops_end at ffffffff8101d7e7
 #4 [ffff883ffef03668] die at ffffffff8101deb2
 #5 [ffff883ffef03698] do_trap at ffffffff8101a700
 #6 [ffff883ffef036e8] do_error_trap at ffffffff8101abfe
 #7 [ffff883ffef037a0] do_invalid_op at ffffffff8101acd0
 #8 [ffff883ffef037b0] invalid_op at ffffffff81a00bab
    [exception RIP: skb_segment+3044]
    RIP: ffffffff817e4dd4  RSP: ffff883ffef03860  RFLAGS: 00010216
    RAX: 0000000000002bf6  RBX: ffff883feb7aaa00  RCX: 0000000000000011
    RDX: ffff883fb87910c0  RSI: 0000000000000011  RDI: ffff883feb7ab500
    RBP: ffff883ffef03928   R8: 0000000000002ce2   R9: 00000000000027da
    R10: 000001ea00000000  R11: 0000000000002d82  R12: ffff883f90a1ee80
    R13: ffff883fb8791120  R14: ffff883feb7abc00  R15: 0000000000002ce2
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #9 [ffff883ffef03930] tcp_gso_segment at ffffffff818713e7

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/skbuff.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 46cb22215ff4..b5c75d4fcf37 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3460,6 +3460,19 @@ void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
 }
 EXPORT_SYMBOL_GPL(skb_pull_rcsum);
 
+static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
+{
+	skb_frag_t head_frag;
+	struct page *page;
+
+	page = virt_to_head_page(frag_skb->head);
+	head_frag.page.p = page;
+	head_frag.page_offset = frag_skb->data -
+		(unsigned char *)page_address(page);
+	head_frag.size = skb_headlen(frag_skb);
+	return head_frag;
+}
+
 /**
  *	skb_segment - Perform protocol segmentation on skb.
  *	@head_skb: buffer to segment
@@ -3664,15 +3677,19 @@ normal:
 
 		while (pos < offset + len) {
 			if (i >= nfrags) {
-				BUG_ON(skb_headlen(list_skb));
-
 				i = 0;
 				nfrags = skb_shinfo(list_skb)->nr_frags;
 				frag = skb_shinfo(list_skb)->frags;
 				frag_skb = list_skb;
+				if (!skb_headlen(list_skb)) {
+					BUG_ON(!nfrags);
+				} else {
+					BUG_ON(!list_skb->head_frag);
 
-				BUG_ON(!nfrags);
-
+					/* to make room for head_frag. */
+					i--;
+					frag--;
+				}
 				if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
 				    skb_zerocopy_clone(nskb, frag_skb,
 						       GFP_ATOMIC))
@@ -3689,7 +3706,7 @@ normal:
 				goto err;
 			}
 
-			*nskb_frag = *frag;
+			*nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
 			__skb_frag_ref(nskb_frag);
 			size = skb_frag_size(nskb_frag);
 
-- 
cgit v1.2.3


From da18ab32d78b6414267d3e5c8c9b5ab34a6d3321 Mon Sep 17 00:00:00 2001
From: kbuild test robot <fengguang.wu@intel.com>
Date: Sat, 24 Mar 2018 03:47:42 +0800
Subject: tipc: tipc_disc_addr_trial_msg() can be static

Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address hash values")
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Acked-by: Jon Maloy jon.maloy@ericsson.com
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/discover.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index e7655736abed..9f666e0650e2 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -134,13 +134,13 @@ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr,
 
 /* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer
  */
-bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d,
-			      struct tipc_media_addr *maddr,
-			      struct tipc_bearer *b,
-			      u32 dst, u32 src,
-			      u32 sugg_addr,
-			      u8 *peer_id,
-			      int mtyp)
+static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d,
+				     struct tipc_media_addr *maddr,
+				     struct tipc_bearer *b,
+				     u32 dst, u32 src,
+				     u32 sugg_addr,
+				     u8 *peer_id,
+				     int mtyp)
 {
 	struct net *net = d->net;
 	struct tipc_net *tn = tipc_net(net);
-- 
cgit v1.2.3


From ede2762d93ff16e0974f7446516b46b1022db213 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 23 Mar 2018 19:47:19 +0300
Subject: net: Make NETDEV_XXX commands enum { }

This patch is preparation to drop NETDEV_UNREGISTER_FINAL.
Since the cmd is used in usnic_ib_netdev_event_to_string()
to get cmd name, after plain removing NETDEV_UNREGISTER_FINAL
from everywhere, we'd have holes in event2str[] in this
function.

Instead of that, let's make NETDEV_XXX commands names
available for everyone, and to define netdev_cmd_to_name()
in the way we won't have to shaffle names after their
numbers are changed.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index f9c28f44286c..055e7ae12759 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1571,6 +1571,26 @@ static void dev_disable_gro_hw(struct net_device *dev)
 		netdev_WARN(dev, "failed to disable GRO_HW!\n");
 }
 
+const char *netdev_cmd_to_name(enum netdev_cmd cmd)
+{
+#define N(val) 						\
+	case NETDEV_##val:				\
+		return "NETDEV_" __stringify(val);
+	switch (cmd) {
+	N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
+	N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
+	N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
+	N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
+	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
+	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
+	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
+	N(UNREGISTER_FINAL)
+	};
+#undef N
+	return "UNKNOWN_NETDEV_EVENT";
+}
+EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
+
 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
 				   struct net_device *dev)
 {
-- 
cgit v1.2.3


From 070f2d7e264acd6316fc24092b7f51a18c75ac9c Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 23 Mar 2018 19:47:39 +0300
Subject: net: Drop NETDEV_UNREGISTER_FINAL

Last user is gone after bdf5bd7f2132 "rds: tcp: remove
register_netdevice_notifier infrastructure.", so we can
remove this netdevice command. This allows to delete
rtnl_lock() in netdev_run_todo(), which is hot path for
net namespace unregistration.

dev_change_net_namespace() and netdev_wait_allrefs()
have rcu_barrier() before NETDEV_UNREGISTER_FINAL call,
and the source commits say they were introduced to
delemit the call with NETDEV_UNREGISTER, but this patch
leaves them on the places, since they require additional
analysis, whether we need in them for something else.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 055e7ae12759..97a96df4b6da 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1584,7 +1584,6 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
 	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
 	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
-	N(UNREGISTER_FINAL)
 	};
 #undef N
 	return "UNKNOWN_NETDEV_EVENT";
@@ -8097,7 +8096,6 @@ static void netdev_wait_allrefs(struct net_device *dev)
 			rcu_barrier();
 			rtnl_lock();
 
-			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
 				     &dev->state)) {
 				/* We must not have linkwatch events
@@ -8169,10 +8167,6 @@ void netdev_run_todo(void)
 			= list_first_entry(&list, struct net_device, todo_list);
 		list_del(&dev->todo_list);
 
-		rtnl_lock();
-		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
-		__rtnl_unlock();
-
 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
 			pr_err("network todo '%s' but state %d\n",
 			       dev->name, dev->reg_state);
@@ -8614,7 +8608,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 	 */
 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 	rcu_barrier();
-	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
 
 	new_nsid = peernet2id_alloc(dev_net(dev), net);
 	/* If there is an ifindex conflict assign a new one */
-- 
cgit v1.2.3


From d6444062f8f07c346a21bd815af4a3dc8b231574 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Fri, 23 Mar 2018 15:54:38 -0700
Subject: net: Use octal not symbolic permissions

Prefer the direct use of octal for permissions.

Done with checkpatch -f --types=SYMBOLIC_PERMS --fix-inplace
and some typing.

Miscellanea:

o Whitespace neatening around these conversions.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlanproc.c                       |  6 ++---
 net/appletalk/atalk_proc.c                 |  8 +++---
 net/atm/atm_sysfs.c                        | 12 ++++-----
 net/atm/clip.c                             |  2 +-
 net/atm/lec.c                              |  2 +-
 net/atm/proc.c                             |  2 +-
 net/ax25/af_ax25.c                         |  6 ++---
 net/bluetooth/rfcomm/tty.c                 |  4 +--
 net/bridge/br_sysfs_br.c                   |  2 +-
 net/bridge/br_sysfs_if.c                   | 36 ++++++++++++-------------
 net/can/af_can.c                           |  2 +-
 net/can/gw.c                               |  2 +-
 net/ceph/ceph_common.c                     |  2 +-
 net/core/net-procfs.c                      |  6 ++---
 net/core/net-sysfs.c                       | 12 ++++-----
 net/core/sock.c                            |  2 +-
 net/decnet/af_decnet.c                     |  2 +-
 net/decnet/dn_dev.c                        |  2 +-
 net/decnet/dn_neigh.c                      |  2 +-
 net/decnet/dn_route.c                      |  2 +-
 net/dns_resolver/dns_key.c                 |  2 +-
 net/ipv4/arp.c                             |  2 +-
 net/ipv4/fib_trie.c                        |  6 ++---
 net/ipv4/igmp.c                            |  4 +--
 net/ipv4/ipconfig.c                        |  2 +-
 net/ipv4/netfilter/ipt_CLUSTERIP.c         |  2 +-
 net/ipv4/ping.c                            |  2 +-
 net/ipv4/proc.c                            |  6 ++---
 net/ipv4/raw.c                             |  2 +-
 net/ipv4/route.c                           |  4 +--
 net/ipv4/tcp_ipv4.c                        |  2 +-
 net/ipv4/udp.c                             |  2 +-
 net/ipv6/addrconf.c                        |  2 +-
 net/ipv6/anycast.c                         |  2 +-
 net/ipv6/ip6_flowlabel.c                   |  2 +-
 net/ipv6/mcast.c                           |  4 +--
 net/ipv6/proc.c                            |  6 ++---
 net/ipv6/raw.c                             |  2 +-
 net/ipv6/route.c                           |  2 +-
 net/kcm/kcmproc.c                          |  4 +--
 net/l2tp/l2tp_ppp.c                        |  2 +-
 net/llc/llc_proc.c                         |  4 +--
 net/mac80211/rc80211_minstrel.c            |  2 +-
 net/mac80211/rc80211_minstrel_debugfs.c    |  8 +++---
 net/mac80211/rc80211_minstrel_ht_debugfs.c |  8 +++---
 net/netfilter/nf_conntrack_netbios_ns.c    |  2 +-
 net/netfilter/nf_conntrack_snmp.c          |  2 +-
 net/netfilter/nf_conntrack_standalone.c    |  2 +-
 net/netfilter/nf_log.c                     |  2 +-
 net/netfilter/nf_synproxy_core.c           |  2 +-
 net/netfilter/xt_IDLETIMER.c               |  2 +-
 net/netfilter/xt_recent.c                  |  4 +--
 net/netrom/af_netrom.c                     |  6 ++---
 net/rose/af_rose.c                         |  8 +++---
 net/rxrpc/af_rxrpc.c                       |  2 +-
 net/sctp/proc.c                            | 16 ++++++------
 net/sunrpc/auth_gss/svcauth_gss.c          |  2 +-
 net/sunrpc/cache.c                         | 10 +++----
 net/sunrpc/debugfs.c                       |  6 ++---
 net/sunrpc/rpc_pipe.c                      | 42 +++++++++++++++---------------
 net/wireless/wext-proc.c                   |  2 +-
 net/x25/x25_proc.c                         | 12 ++++-----
 net/xfrm/xfrm_proc.c                       |  2 +-
 63 files changed, 161 insertions(+), 161 deletions(-)

(limited to 'net')

diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index a662ccc166df..a627a5db2125 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -148,8 +148,8 @@ int __net_init vlan_proc_init(struct net *net)
 	if (!vn->proc_vlan_dir)
 		goto err;
 
-	vn->proc_vlan_conf = proc_create(name_conf, S_IFREG|S_IRUSR|S_IWUSR,
-				     vn->proc_vlan_dir, &vlan_fops);
+	vn->proc_vlan_conf = proc_create(name_conf, S_IFREG | 0600,
+					 vn->proc_vlan_dir, &vlan_fops);
 	if (!vn->proc_vlan_conf)
 		goto err;
 	return 0;
@@ -172,7 +172,7 @@ int vlan_proc_add_dev(struct net_device *vlandev)
 	if (!strcmp(vlandev->name, name_conf))
 		return -EINVAL;
 	vlan->dent =
-		proc_create_data(vlandev->name, S_IFREG|S_IRUSR|S_IWUSR,
+		proc_create_data(vlandev->name, S_IFREG | 0600,
 				 vn->proc_vlan_dir, &vlandev_fops, vlandev);
 	if (!vlan->dent)
 		return -ENOBUFS;
diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c
index a3bf9d519193..7214aea14cb3 100644
--- a/net/appletalk/atalk_proc.c
+++ b/net/appletalk/atalk_proc.c
@@ -257,22 +257,22 @@ int __init atalk_proc_init(void)
 	if (!atalk_proc_dir)
 		goto out;
 
-	p = proc_create("interface", S_IRUGO, atalk_proc_dir,
+	p = proc_create("interface", 0444, atalk_proc_dir,
 			&atalk_seq_interface_fops);
 	if (!p)
 		goto out_interface;
 
-	p = proc_create("route", S_IRUGO, atalk_proc_dir,
+	p = proc_create("route", 0444, atalk_proc_dir,
 			&atalk_seq_route_fops);
 	if (!p)
 		goto out_route;
 
-	p = proc_create("socket", S_IRUGO, atalk_proc_dir,
+	p = proc_create("socket", 0444, atalk_proc_dir,
 			&atalk_seq_socket_fops);
 	if (!p)
 		goto out_socket;
 
-	p = proc_create("arp", S_IRUGO, atalk_proc_dir, &atalk_seq_arp_fops);
+	p = proc_create("arp", 0444, atalk_proc_dir, &atalk_seq_arp_fops);
 	if (!p)
 		goto out_arp;
 
diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c
index 5d2fed9f5710..39b94ca5f65d 100644
--- a/net/atm/atm_sysfs.c
+++ b/net/atm/atm_sysfs.c
@@ -96,12 +96,12 @@ static ssize_t show_link_rate(struct device *cdev,
 	return scnprintf(buf, PAGE_SIZE, "%d\n", link_rate);
 }
 
-static DEVICE_ATTR(address, S_IRUGO, show_address, NULL);
-static DEVICE_ATTR(atmaddress, S_IRUGO, show_atmaddress, NULL);
-static DEVICE_ATTR(atmindex, S_IRUGO, show_atmindex, NULL);
-static DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL);
-static DEVICE_ATTR(type, S_IRUGO, show_type, NULL);
-static DEVICE_ATTR(link_rate, S_IRUGO, show_link_rate, NULL);
+static DEVICE_ATTR(address, 0444, show_address, NULL);
+static DEVICE_ATTR(atmaddress, 0444, show_atmaddress, NULL);
+static DEVICE_ATTR(atmindex, 0444, show_atmindex, NULL);
+static DEVICE_ATTR(carrier, 0444, show_carrier, NULL);
+static DEVICE_ATTR(type, 0444, show_type, NULL);
+static DEVICE_ATTR(link_rate, 0444, show_link_rate, NULL);
 
 static struct device_attribute *atm_attrs[] = {
 	&dev_attr_atmaddress,
diff --git a/net/atm/clip.c b/net/atm/clip.c
index d4f6029d5109..f07dbc632222 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -893,7 +893,7 @@ static int __init atm_clip_init(void)
 	{
 		struct proc_dir_entry *p;
 
-		p = proc_create("arp", S_IRUGO, atm_proc_root, &arp_seq_fops);
+		p = proc_create("arp", 0444, atm_proc_root, &arp_seq_fops);
 		if (!p) {
 			pr_err("Unable to initialize /proc/net/atm/arp\n");
 			atm_clip_exit_noproc();
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 09a1f056712a..01d5d20a6eb1 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -1042,7 +1042,7 @@ static int __init lane_module_init(void)
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry *p;
 
-	p = proc_create("lec", S_IRUGO, atm_proc_root, &lec_seq_fops);
+	p = proc_create("lec", 0444, atm_proc_root, &lec_seq_fops);
 	if (!p) {
 		pr_err("Unable to initialize /proc/net/atm/lec\n");
 		return -ENOMEM;
diff --git a/net/atm/proc.c b/net/atm/proc.c
index edc48edc95c1..55410c00c7e2 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -474,7 +474,7 @@ int __init atm_proc_init(void)
 	for (e = atm_proc_ents; e->name; e++) {
 		struct proc_dir_entry *dirent;
 
-		dirent = proc_create(e->name, S_IRUGO,
+		dirent = proc_create(e->name, 0444,
 				     atm_proc_root, e->proc_fops);
 		if (!dirent)
 			goto err_out_remove;
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index c8319ed48485..2b41366fcad2 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1989,10 +1989,10 @@ static int __init ax25_init(void)
 	dev_add_pack(&ax25_packet_type);
 	register_netdevice_notifier(&ax25_dev_notifier);
 
-	proc_create("ax25_route", S_IRUGO, init_net.proc_net,
+	proc_create("ax25_route", 0444, init_net.proc_net,
 		    &ax25_route_fops);
-	proc_create("ax25", S_IRUGO, init_net.proc_net, &ax25_info_fops);
-	proc_create("ax25_calls", S_IRUGO, init_net.proc_net, &ax25_uid_fops);
+	proc_create("ax25", 0444, init_net.proc_net, &ax25_info_fops);
+	proc_create("ax25_calls", 0444, init_net.proc_net, &ax25_uid_fops);
 out:
 	return rc;
 }
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 5f3074cb6b4d..5e44d842cc5d 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -210,8 +210,8 @@ static ssize_t show_channel(struct device *tty_dev, struct device_attribute *att
 	return sprintf(buf, "%d\n", dev->channel);
 }
 
-static DEVICE_ATTR(address, S_IRUGO, show_address, NULL);
-static DEVICE_ATTR(channel, S_IRUGO, show_channel, NULL);
+static DEVICE_ATTR(address, 0444, show_address, NULL);
+static DEVICE_ATTR(channel, 0444, show_channel, NULL);
 
 static struct rfcomm_dev *__rfcomm_dev_add(struct rfcomm_dev_req *req,
 					   struct rfcomm_dlc *dlc)
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index b1be0dcfba6b..0318a69888d4 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -893,7 +893,7 @@ static ssize_t brforward_read(struct file *filp, struct kobject *kobj,
 
 static struct bin_attribute bridge_forward = {
 	.attr = { .name = SYSFS_BRIDGE_FDB,
-		  .mode = S_IRUGO, },
+		  .mode = 0444, },
 	.read = brforward_read,
 };
 
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 126a8ea73c96..fd31ad83ec7b 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -44,7 +44,7 @@ static int store_##_name(struct net_bridge_port *p, unsigned long v) \
 {								\
 	return store_flag(p, v, _mask);				\
 }								\
-static BRPORT_ATTR(_name, S_IRUGO | S_IWUSR,			\
+static BRPORT_ATTR(_name, 0644,					\
 		   show_##_name, store_##_name)
 
 static int store_flag(struct net_bridge_port *p, unsigned long v,
@@ -71,7 +71,7 @@ static ssize_t show_path_cost(struct net_bridge_port *p, char *buf)
 	return sprintf(buf, "%d\n", p->path_cost);
 }
 
-static BRPORT_ATTR(path_cost, S_IRUGO | S_IWUSR,
+static BRPORT_ATTR(path_cost, 0644,
 		   show_path_cost, br_stp_set_path_cost);
 
 static ssize_t show_priority(struct net_bridge_port *p, char *buf)
@@ -79,91 +79,91 @@ static ssize_t show_priority(struct net_bridge_port *p, char *buf)
 	return sprintf(buf, "%d\n", p->priority);
 }
 
-static BRPORT_ATTR(priority, S_IRUGO | S_IWUSR,
+static BRPORT_ATTR(priority, 0644,
 			 show_priority, br_stp_set_port_priority);
 
 static ssize_t show_designated_root(struct net_bridge_port *p, char *buf)
 {
 	return br_show_bridge_id(buf, &p->designated_root);
 }
-static BRPORT_ATTR(designated_root, S_IRUGO, show_designated_root, NULL);
+static BRPORT_ATTR(designated_root, 0444, show_designated_root, NULL);
 
 static ssize_t show_designated_bridge(struct net_bridge_port *p, char *buf)
 {
 	return br_show_bridge_id(buf, &p->designated_bridge);
 }
-static BRPORT_ATTR(designated_bridge, S_IRUGO, show_designated_bridge, NULL);
+static BRPORT_ATTR(designated_bridge, 0444, show_designated_bridge, NULL);
 
 static ssize_t show_designated_port(struct net_bridge_port *p, char *buf)
 {
 	return sprintf(buf, "%d\n", p->designated_port);
 }
-static BRPORT_ATTR(designated_port, S_IRUGO, show_designated_port, NULL);
+static BRPORT_ATTR(designated_port, 0444, show_designated_port, NULL);
 
 static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf)
 {
 	return sprintf(buf, "%d\n", p->designated_cost);
 }
-static BRPORT_ATTR(designated_cost, S_IRUGO, show_designated_cost, NULL);
+static BRPORT_ATTR(designated_cost, 0444, show_designated_cost, NULL);
 
 static ssize_t show_port_id(struct net_bridge_port *p, char *buf)
 {
 	return sprintf(buf, "0x%x\n", p->port_id);
 }
-static BRPORT_ATTR(port_id, S_IRUGO, show_port_id, NULL);
+static BRPORT_ATTR(port_id, 0444, show_port_id, NULL);
 
 static ssize_t show_port_no(struct net_bridge_port *p, char *buf)
 {
 	return sprintf(buf, "0x%x\n", p->port_no);
 }
 
-static BRPORT_ATTR(port_no, S_IRUGO, show_port_no, NULL);
+static BRPORT_ATTR(port_no, 0444, show_port_no, NULL);
 
 static ssize_t show_change_ack(struct net_bridge_port *p, char *buf)
 {
 	return sprintf(buf, "%d\n", p->topology_change_ack);
 }
-static BRPORT_ATTR(change_ack, S_IRUGO, show_change_ack, NULL);
+static BRPORT_ATTR(change_ack, 0444, show_change_ack, NULL);
 
 static ssize_t show_config_pending(struct net_bridge_port *p, char *buf)
 {
 	return sprintf(buf, "%d\n", p->config_pending);
 }
-static BRPORT_ATTR(config_pending, S_IRUGO, show_config_pending, NULL);
+static BRPORT_ATTR(config_pending, 0444, show_config_pending, NULL);
 
 static ssize_t show_port_state(struct net_bridge_port *p, char *buf)
 {
 	return sprintf(buf, "%d\n", p->state);
 }
-static BRPORT_ATTR(state, S_IRUGO, show_port_state, NULL);
+static BRPORT_ATTR(state, 0444, show_port_state, NULL);
 
 static ssize_t show_message_age_timer(struct net_bridge_port *p,
 					    char *buf)
 {
 	return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer));
 }
-static BRPORT_ATTR(message_age_timer, S_IRUGO, show_message_age_timer, NULL);
+static BRPORT_ATTR(message_age_timer, 0444, show_message_age_timer, NULL);
 
 static ssize_t show_forward_delay_timer(struct net_bridge_port *p,
 					    char *buf)
 {
 	return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer));
 }
-static BRPORT_ATTR(forward_delay_timer, S_IRUGO, show_forward_delay_timer, NULL);
+static BRPORT_ATTR(forward_delay_timer, 0444, show_forward_delay_timer, NULL);
 
 static ssize_t show_hold_timer(struct net_bridge_port *p,
 					    char *buf)
 {
 	return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer));
 }
-static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL);
+static BRPORT_ATTR(hold_timer, 0444, show_hold_timer, NULL);
 
 static int store_flush(struct net_bridge_port *p, unsigned long v)
 {
 	br_fdb_delete_by_port(p->br, p, 0, 0); // Don't delete local entry
 	return 0;
 }
-static BRPORT_ATTR(flush, S_IWUSR, NULL, store_flush);
+static BRPORT_ATTR(flush, 0200, NULL, store_flush);
 
 static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf)
 {
@@ -179,7 +179,7 @@ static int store_group_fwd_mask(struct net_bridge_port *p,
 
 	return 0;
 }
-static BRPORT_ATTR(group_fwd_mask, S_IRUGO | S_IWUSR, show_group_fwd_mask,
+static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask,
 		   store_group_fwd_mask);
 
 BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE);
@@ -204,7 +204,7 @@ static int store_multicast_router(struct net_bridge_port *p,
 {
 	return br_multicast_set_port_router(p, v);
 }
-static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router,
+static BRPORT_ATTR(multicast_router, 0644, show_multicast_router,
 		   store_multicast_router);
 
 BRPORT_ATTR_FLAG(multicast_fast_leave, BR_MULTICAST_FAST_LEAVE);
diff --git a/net/can/af_can.c b/net/can/af_can.c
index e899970398a1..2f0d0a72e4b5 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -72,7 +72,7 @@ MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>, "
 MODULE_ALIAS_NETPROTO(PF_CAN);
 
 static int stats_timer __read_mostly = 1;
-module_param(stats_timer, int, S_IRUGO);
+module_param(stats_timer, int, 0444);
 MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)");
 
 static struct kmem_cache *rcv_cache __read_mostly;
diff --git a/net/can/gw.c b/net/can/gw.c
index 08e97668d5cf..8d71e199d5b3 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -72,7 +72,7 @@ MODULE_ALIAS(CAN_GW_NAME);
 #define CGW_DEFAULT_HOPS 1
 
 static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS;
-module_param(max_hops, uint, S_IRUGO);
+module_param(max_hops, uint, 0444);
 MODULE_PARM_DESC(max_hops,
 		 "maximum " CAN_GW_NAME " routing hops for CAN frames "
 		 "(valid values: " __stringify(CGW_MIN_HOPS) "-"
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4d4c82229e9e..4adf07826f4a 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -54,7 +54,7 @@ static const struct kernel_param_ops param_ops_supported_features = {
 	.get = param_get_supported_features,
 };
 module_param_cb(supported_features, &param_ops_supported_features, NULL,
-		S_IRUGO);
+		0444);
 
 const char *ceph_msg_type_name(int type)
 {
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 65b51e778782..dd6ae431d038 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -315,12 +315,12 @@ static int __net_init dev_proc_net_init(struct net *net)
 {
 	int rc = -ENOMEM;
 
-	if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops))
+	if (!proc_create("dev", 0444, net->proc_net, &dev_seq_fops))
 		goto out;
-	if (!proc_create("softnet_stat", S_IRUGO, net->proc_net,
+	if (!proc_create("softnet_stat", 0444, net->proc_net,
 			 &softnet_seq_fops))
 		goto out_dev;
-	if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
+	if (!proc_create("ptype", 0444, net->proc_net, &ptype_seq_fops))
 		goto out_softnet;
 
 	if (wext_proc_init(net))
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 60a5ad2c33ee..c476f0794132 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -431,7 +431,7 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr,
 	return netdev_store(dev, attr, buf, len, change_group);
 }
 NETDEVICE_SHOW(group, fmt_dec);
-static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store);
+static DEVICE_ATTR(netdev_group, 0644, group_show, group_store);
 
 static int change_proto_down(struct net_device *dev, unsigned long proto_down)
 {
@@ -854,10 +854,10 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
 }
 
 static struct rx_queue_attribute rps_cpus_attribute __ro_after_init
-	= __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
+	= __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map);
 
 static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init
-	= __ATTR(rps_flow_cnt, S_IRUGO | S_IWUSR,
+	= __ATTR(rps_flow_cnt, 0644,
 		 show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt);
 #endif /* CONFIG_RPS */
 
@@ -1154,7 +1154,7 @@ static ssize_t bql_set_hold_time(struct netdev_queue *queue,
 }
 
 static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init
-	= __ATTR(hold_time, S_IRUGO | S_IWUSR,
+	= __ATTR(hold_time, 0644,
 		 bql_show_hold_time, bql_set_hold_time);
 
 static ssize_t bql_show_inflight(struct netdev_queue *queue,
@@ -1166,7 +1166,7 @@ static ssize_t bql_show_inflight(struct netdev_queue *queue,
 }
 
 static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init =
-	__ATTR(inflight, S_IRUGO, bql_show_inflight, NULL);
+	__ATTR(inflight, 0444, bql_show_inflight, NULL);
 
 #define BQL_ATTR(NAME, FIELD)						\
 static ssize_t bql_show_ ## NAME(struct netdev_queue *queue,		\
@@ -1182,7 +1182,7 @@ static ssize_t bql_set_ ## NAME(struct netdev_queue *queue,		\
 }									\
 									\
 static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \
-	= __ATTR(NAME, S_IRUGO | S_IWUSR,				\
+	= __ATTR(NAME, 0644,				\
 		 bql_show_ ## NAME, bql_set_ ## NAME)
 
 BQL_ATTR(limit, limit);
diff --git a/net/core/sock.c b/net/core/sock.c
index e689496dfd8a..8cee2920a47f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3455,7 +3455,7 @@ static const struct file_operations proto_seq_fops = {
 
 static __net_init int proto_init_net(struct net *net)
 {
-	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
+	if (!proc_create("protocols", 0444, net->proc_net, &proto_seq_fops))
 		return -ENOMEM;
 
 	return 0;
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 2ee8306c23e3..32751602767f 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -2383,7 +2383,7 @@ static int __init decnet_init(void)
 	dev_add_pack(&dn_dix_packet_type);
 	register_netdevice_notifier(&dn_dev_notifier);
 
-	proc_create("decnet", S_IRUGO, init_net.proc_net, &dn_socket_seq_fops);
+	proc_create("decnet", 0444, init_net.proc_net, &dn_socket_seq_fops);
 	dn_register_sysctl();
 out:
 	return rc;
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index c9f5e1ebb9c8..c03b046478c3 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -1424,7 +1424,7 @@ void __init dn_dev_init(void)
 	rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETADDR,
 			     NULL, dn_nl_dump_ifaddr, 0);
 
-	proc_create("decnet_dev", S_IRUGO, init_net.proc_net, &dn_dev_seq_fops);
+	proc_create("decnet_dev", 0444, init_net.proc_net, &dn_dev_seq_fops);
 
 #ifdef CONFIG_SYSCTL
 	{
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 6e37d9e6345e..13156165afa3 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -608,7 +608,7 @@ static const struct file_operations dn_neigh_seq_fops = {
 void __init dn_neigh_init(void)
 {
 	neigh_table_init(NEIGH_DN_TABLE, &dn_neigh_table);
-	proc_create("decnet_neigh", S_IRUGO, init_net.proc_net,
+	proc_create("decnet_neigh", 0444, init_net.proc_net,
 		    &dn_neigh_seq_fops);
 }
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index ef20b8e31669..eca0cc6b761f 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1918,7 +1918,7 @@ void __init dn_route_init(void)
 
 	dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1);
 
-	proc_create("decnet_cache", S_IRUGO, init_net.proc_net,
+	proc_create("decnet_cache", 0444, init_net.proc_net,
 		    &dn_rt_cache_seq_fops);
 
 #ifdef CONFIG_DECNET_ROUTER
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index e1d4d898a007..8396705deffc 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -38,7 +38,7 @@ MODULE_AUTHOR("Wang Lei");
 MODULE_LICENSE("GPL");
 
 unsigned int dns_resolver_debug;
-module_param_named(debug, dns_resolver_debug, uint, S_IWUSR | S_IRUGO);
+module_param_named(debug, dns_resolver_debug, uint, 0644);
 MODULE_PARM_DESC(debug, "DNS Resolver debugging mask");
 
 const struct cred *dns_resolver_cache;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 7dc9de8444a9..4c6ba0fb2630 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1434,7 +1434,7 @@ static const struct file_operations arp_seq_fops = {
 
 static int __net_init arp_net_init(struct net *net)
 {
-	if (!proc_create("arp", S_IRUGO, net->proc_net, &arp_seq_fops))
+	if (!proc_create("arp", 0444, net->proc_net, &arp_seq_fops))
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 62243a8abf92..fac0b73e24d1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2722,14 +2722,14 @@ static const struct file_operations fib_route_fops = {
 
 int __net_init fib_proc_init(struct net *net)
 {
-	if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops))
+	if (!proc_create("fib_trie", 0444, net->proc_net, &fib_trie_fops))
 		goto out1;
 
-	if (!proc_create("fib_triestat", S_IRUGO, net->proc_net,
+	if (!proc_create("fib_triestat", 0444, net->proc_net,
 			 &fib_triestat_fops))
 		goto out2;
 
-	if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops))
+	if (!proc_create("route", 0444, net->proc_net, &fib_route_fops))
 		goto out3;
 
 	return 0;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index c2743763777e..f17cd83ba164 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2993,10 +2993,10 @@ static int __net_init igmp_net_init(struct net *net)
 	struct proc_dir_entry *pde;
 	int err;
 
-	pde = proc_create("igmp", S_IRUGO, net->proc_net, &igmp_mc_seq_fops);
+	pde = proc_create("igmp", 0444, net->proc_net, &igmp_mc_seq_fops);
 	if (!pde)
 		goto out_igmp;
-	pde = proc_create("mcfilter", S_IRUGO, net->proc_net,
+	pde = proc_create("mcfilter", 0444, net->proc_net,
 			  &igmp_mcf_seq_fops);
 	if (!pde)
 		goto out_mcfilter;
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f75802ad960f..43f620feb1c4 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1369,7 +1369,7 @@ static int __init ip_auto_config(void)
 	unsigned int i;
 
 #ifdef CONFIG_PROC_FS
-	proc_create("pnp", S_IRUGO, init_net.proc_net, &pnp_seq_fops);
+	proc_create("pnp", 0444, init_net.proc_net, &pnp_seq_fops);
 #endif /* CONFIG_PROC_FS */
 
 	if (!ic_enable)
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 0fc88fa7a4dc..31b4cca588d0 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -250,7 +250,7 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
 
 		/* create proc dir entry */
 		sprintf(buffer, "%pI4", &ip);
-		c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
+		c->pde = proc_create_data(buffer, 0600,
 					  cn->procdir,
 					  &clusterip_proc_fops, c);
 		if (!c->pde) {
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 0164def9c808..1f24bc8273a0 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1177,7 +1177,7 @@ static struct ping_seq_afinfo ping_v4_seq_afinfo = {
 int ping_proc_register(struct net *net, struct ping_seq_afinfo *afinfo)
 {
 	struct proc_dir_entry *p;
-	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+	p = proc_create_data(afinfo->name, 0444, net->proc_net,
 			     afinfo->seq_fops, afinfo);
 	if (!p)
 		return -ENOMEM;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index d97e83b2dd33..80de2e659dcb 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -521,12 +521,12 @@ static const struct file_operations netstat_seq_fops = {
 
 static __net_init int ip_proc_init_net(struct net *net)
 {
-	if (!proc_create("sockstat", S_IRUGO, net->proc_net,
+	if (!proc_create("sockstat", 0444, net->proc_net,
 			 &sockstat_seq_fops))
 		goto out_sockstat;
-	if (!proc_create("netstat", S_IRUGO, net->proc_net, &netstat_seq_fops))
+	if (!proc_create("netstat", 0444, net->proc_net, &netstat_seq_fops))
 		goto out_netstat;
-	if (!proc_create("snmp", S_IRUGO, net->proc_net, &snmp_seq_fops))
+	if (!proc_create("snmp", 0444, net->proc_net, &snmp_seq_fops))
 		goto out_snmp;
 
 	return 0;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 720bef7da2f6..0ee2501a9027 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -1140,7 +1140,7 @@ static const struct file_operations raw_seq_fops = {
 
 static __net_init int raw_init_net(struct net *net)
 {
-	if (!proc_create("raw", S_IRUGO, net->proc_net, &raw_seq_fops))
+	if (!proc_create("raw", 0444, net->proc_net, &raw_seq_fops))
 		return -ENOMEM;
 
 	return 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 4ac5728689f5..ce9bd5380d21 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -379,12 +379,12 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
 {
 	struct proc_dir_entry *pde;
 
-	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
+	pde = proc_create("rt_cache", 0444, net->proc_net,
 			  &rt_cache_seq_fops);
 	if (!pde)
 		goto err1;
 
-	pde = proc_create("rt_cache", S_IRUGO,
+	pde = proc_create("rt_cache", 0444,
 			  net->proc_net_stat, &rt_cpu_seq_fops);
 	if (!pde)
 		goto err2;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2c6aec2643e8..fec8b1fd7b63 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2215,7 +2215,7 @@ int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
 	afinfo->seq_ops.next		= tcp_seq_next;
 	afinfo->seq_ops.stop		= tcp_seq_stop;
 
-	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+	p = proc_create_data(afinfo->name, 0444, net->proc_net,
 			     afinfo->seq_fops, afinfo);
 	if (!p)
 		rc = -ENOMEM;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c6dc019bc64b..fb8f3a36bd14 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2673,7 +2673,7 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
 	afinfo->seq_ops.next		= udp_seq_next;
 	afinfo->seq_ops.stop		= udp_seq_stop;
 
-	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+	p = proc_create_data(afinfo->name, 0444, net->proc_net,
 			     afinfo->seq_fops, afinfo);
 	if (!p)
 		rc = -ENOMEM;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 6fd4bbdc444f..9ca90cd9fba4 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4281,7 +4281,7 @@ static const struct file_operations if6_fops = {
 
 static int __net_init if6_proc_net_init(struct net *net)
 {
-	if (!proc_create("if_inet6", S_IRUGO, net->proc_net, &if6_fops))
+	if (!proc_create("if_inet6", 0444, net->proc_net, &if6_fops))
 		return -ENOMEM;
 	return 0;
 }
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index d580d4d456a5..bbcabbba9bd8 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -544,7 +544,7 @@ static const struct file_operations ac6_seq_fops = {
 
 int __net_init ac6_proc_init(struct net *net)
 {
-	if (!proc_create("anycast6", S_IRUGO, net->proc_net, &ac6_seq_fops))
+	if (!proc_create("anycast6", 0444, net->proc_net, &ac6_seq_fops))
 		return -ENOMEM;
 
 	return 0;
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 6ddf52282894..f75b06ba8325 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -844,7 +844,7 @@ static const struct file_operations ip6fl_seq_fops = {
 
 static int __net_init ip6_flowlabel_proc_init(struct net *net)
 {
-	if (!proc_create("ip6_flowlabel", S_IRUGO, net->proc_net,
+	if (!proc_create("ip6_flowlabel", 0444, net->proc_net,
 			 &ip6fl_seq_fops))
 		return -ENOMEM;
 	return 0;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index d1a0cefac273..1e4c2b6ebd78 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -2921,9 +2921,9 @@ static int __net_init igmp6_proc_init(struct net *net)
 	int err;
 
 	err = -ENOMEM;
-	if (!proc_create("igmp6", S_IRUGO, net->proc_net, &igmp6_mc_seq_fops))
+	if (!proc_create("igmp6", 0444, net->proc_net, &igmp6_mc_seq_fops))
 		goto out;
-	if (!proc_create("mcfilter6", S_IRUGO, net->proc_net,
+	if (!proc_create("mcfilter6", 0444, net->proc_net,
 			 &igmp6_mcf_seq_fops))
 		goto out_proc_net_igmp6;
 
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 1678cf037688..f9891fa672f3 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -290,7 +290,7 @@ int snmp6_register_dev(struct inet6_dev *idev)
 	if (!net->mib.proc_net_devsnmp6)
 		return -ENOENT;
 
-	p = proc_create_data(idev->dev->name, S_IRUGO,
+	p = proc_create_data(idev->dev->name, 0444,
 			     net->mib.proc_net_devsnmp6,
 			     &snmp6_dev_seq_fops, idev);
 	if (!p)
@@ -314,11 +314,11 @@ int snmp6_unregister_dev(struct inet6_dev *idev)
 
 static int __net_init ipv6_proc_init_net(struct net *net)
 {
-	if (!proc_create("sockstat6", S_IRUGO, net->proc_net,
+	if (!proc_create("sockstat6", 0444, net->proc_net,
 			 &sockstat6_seq_fops))
 		return -ENOMEM;
 
-	if (!proc_create("snmp6", S_IRUGO, net->proc_net, &snmp6_seq_fops))
+	if (!proc_create("snmp6", 0444, net->proc_net, &snmp6_seq_fops))
 		goto proc_snmp6_fail;
 
 	net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 10a4ac4933b7..b5e5de732494 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1318,7 +1318,7 @@ static const struct file_operations raw6_seq_fops = {
 
 static int __net_init raw6_init_net(struct net *net)
 {
-	if (!proc_create("raw6", S_IRUGO, net->proc_net, &raw6_seq_fops))
+	if (!proc_create("raw6", 0444, net->proc_net, &raw6_seq_fops))
 		return -ENOMEM;
 
 	return 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a2ed9fdd58d4..1d0eaa69874d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5067,7 +5067,7 @@ static int __net_init ip6_route_net_init_late(struct net *net)
 {
 #ifdef CONFIG_PROC_FS
 	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
-	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
+	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
 #endif
 	return 0;
 }
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 2c1c8b3e4452..4c2e9907f254 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -269,7 +269,7 @@ static int kcm_proc_register(struct net *net, struct kcm_seq_muxinfo *muxinfo)
 	struct proc_dir_entry *p;
 	int rc = 0;
 
-	p = proc_create_data(muxinfo->name, S_IRUGO, net->proc_net,
+	p = proc_create_data(muxinfo->name, 0444, net->proc_net,
 			     muxinfo->seq_fops, muxinfo);
 	if (!p)
 		rc = -ENOMEM;
@@ -406,7 +406,7 @@ static int kcm_proc_init_net(struct net *net)
 {
 	int err;
 
-	if (!proc_create("kcm_stats", S_IRUGO, net->proc_net,
+	if (!proc_create("kcm_stats", 0444, net->proc_net,
 			 &kcm_stats_seq_fops)) {
 		err = -ENOMEM;
 		goto out_kcm_stats;
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 977bca659787..f24504efe729 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1742,7 +1742,7 @@ static __net_init int pppol2tp_init_net(struct net *net)
 	struct proc_dir_entry *pde;
 	int err = 0;
 
-	pde = proc_create("pppol2tp", S_IRUGO, net->proc_net,
+	pde = proc_create("pppol2tp", 0444, net->proc_net,
 			  &pppol2tp_proc_fops);
 	if (!pde) {
 		err = -ENOMEM;
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
index 66821e8a2b7a..62ea0aed94b4 100644
--- a/net/llc/llc_proc.c
+++ b/net/llc/llc_proc.c
@@ -249,11 +249,11 @@ int __init llc_proc_init(void)
 	if (!llc_proc_dir)
 		goto out;
 
-	p = proc_create("socket", S_IRUGO, llc_proc_dir, &llc_seq_socket_fops);
+	p = proc_create("socket", 0444, llc_proc_dir, &llc_seq_socket_fops);
 	if (!p)
 		goto out_socket;
 
-	p = proc_create("core", S_IRUGO, llc_proc_dir, &llc_seq_core_fops);
+	p = proc_create("core", 0444, llc_proc_dir, &llc_seq_core_fops);
 	if (!p)
 		goto out_core;
 
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 9766c1cc4b0a..8221bc5582ab 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -690,7 +690,7 @@ minstrel_alloc(struct ieee80211_hw *hw, struct dentry *debugfsdir)
 #ifdef CONFIG_MAC80211_DEBUGFS
 	mp->fixed_rate_idx = (u32) -1;
 	mp->dbg_fixed_rate = debugfs_create_u32("fixed_rate_idx",
-			S_IRUGO | S_IWUGO, debugfsdir, &mp->fixed_rate_idx);
+			0666, debugfsdir, &mp->fixed_rate_idx);
 #endif
 
 	minstrel_init_cck_rates(mp);
diff --git a/net/mac80211/rc80211_minstrel_debugfs.c b/net/mac80211/rc80211_minstrel_debugfs.c
index 36fc971deb86..9ad7d63d3e5b 100644
--- a/net/mac80211/rc80211_minstrel_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_debugfs.c
@@ -214,11 +214,11 @@ minstrel_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir)
 {
 	struct minstrel_sta_info *mi = priv_sta;
 
-	mi->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, mi,
-			&minstrel_stat_fops);
+	mi->dbg_stats = debugfs_create_file("rc_stats", 0444, dir, mi,
+					    &minstrel_stat_fops);
 
-	mi->dbg_stats_csv = debugfs_create_file("rc_stats_csv", S_IRUGO, dir,
-			mi, &minstrel_stat_csv_fops);
+	mi->dbg_stats_csv = debugfs_create_file("rc_stats_csv", 0444, dir, mi,
+						&minstrel_stat_csv_fops);
 }
 
 void
diff --git a/net/mac80211/rc80211_minstrel_ht_debugfs.c b/net/mac80211/rc80211_minstrel_ht_debugfs.c
index 7d969e300fb3..bfcc03152dc6 100644
--- a/net/mac80211/rc80211_minstrel_ht_debugfs.c
+++ b/net/mac80211/rc80211_minstrel_ht_debugfs.c
@@ -303,10 +303,10 @@ minstrel_ht_add_sta_debugfs(void *priv, void *priv_sta, struct dentry *dir)
 {
 	struct minstrel_ht_sta_priv *msp = priv_sta;
 
-	msp->dbg_stats = debugfs_create_file("rc_stats", S_IRUGO, dir, msp,
-			&minstrel_ht_stat_fops);
-	msp->dbg_stats_csv = debugfs_create_file("rc_stats_csv", S_IRUGO,
-			     dir, msp, &minstrel_ht_stat_csv_fops);
+	msp->dbg_stats = debugfs_create_file("rc_stats", 0444, dir, msp,
+					     &minstrel_ht_stat_fops);
+	msp->dbg_stats_csv = debugfs_create_file("rc_stats_csv", 0444, dir, msp,
+						 &minstrel_ht_stat_csv_fops);
 }
 
 void
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
index 496ce173f0c1..cc11bf890eb9 100644
--- a/net/netfilter/nf_conntrack_netbios_ns.c
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -33,7 +33,7 @@ MODULE_ALIAS("ip_conntrack_netbios_ns");
 MODULE_ALIAS_NFCT_HELPER("netbios_ns");
 
 static unsigned int timeout __read_mostly = 3;
-module_param(timeout, uint, S_IRUSR);
+module_param(timeout, uint, 0400);
 MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
 
 static struct nf_conntrack_expect_policy exp_policy = {
diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c
index 87b95a2c270c..1b18f43ad226 100644
--- a/net/netfilter/nf_conntrack_snmp.c
+++ b/net/netfilter/nf_conntrack_snmp.c
@@ -26,7 +26,7 @@ MODULE_LICENSE("GPL");
 MODULE_ALIAS_NFCT_HELPER("snmp");
 
 static unsigned int timeout __read_mostly = 30;
-module_param(timeout, uint, S_IRUSR);
+module_param(timeout, uint, 0400);
 MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
 
 int (*nf_nat_snmp_hook)(struct sk_buff *skb,
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 3cdce391362e..98844c87d01e 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -495,7 +495,7 @@ static int nf_conntrack_standalone_init_proc(struct net *net)
 	if (uid_valid(root_uid) && gid_valid(root_gid))
 		proc_set_user(pde, root_uid, root_gid);
 
-	pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat,
+	pde = proc_create("nf_conntrack", 0444, net->proc_net_stat,
 			  &ct_cpu_seq_fops);
 	if (!pde)
 		goto out_stat_nf_conntrack;
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 1ba3da51050d..a964e4d356cc 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -549,7 +549,7 @@ static int __net_init nf_log_net_init(struct net *net)
 	int ret = -ENOMEM;
 
 #ifdef CONFIG_PROC_FS
-	if (!proc_create("nf_log", S_IRUGO,
+	if (!proc_create("nf_log", 0444,
 			 net->nf.proc_netfilter, &nflog_file_ops))
 		return ret;
 #endif
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 64b875e452ca..8f16fd27132d 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -325,7 +325,7 @@ static const struct file_operations synproxy_cpu_seq_fops = {
 
 static int __net_init synproxy_proc_init(struct net *net)
 {
-	if (!proc_create("synproxy", S_IRUGO, net->proc_net_stat,
+	if (!proc_create("synproxy", 0444, net->proc_net_stat,
 			 &synproxy_cpu_seq_fops))
 		return -ENOMEM;
 	return 0;
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index 1ac6600bfafd..5ee859193783 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -132,7 +132,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
 		ret = -ENOMEM;
 		goto out_free_timer;
 	}
-	info->timer->attr.attr.mode = S_IRUGO;
+	info->timer->attr.attr.mode = 0444;
 	info->timer->attr.show = idletimer_tg_show;
 
 	ret = sysfs_create_file(idletimer_tg_kobj, &info->timer->attr.attr);
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 486dd24da78b..434e35ce940b 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -51,8 +51,8 @@ static unsigned int ip_list_gid __read_mostly;
 module_param(ip_list_tot, uint, 0400);
 module_param(ip_list_hash_size, uint, 0400);
 module_param(ip_list_perms, uint, 0400);
-module_param(ip_list_uid, uint, S_IRUGO | S_IWUSR);
-module_param(ip_list_gid, uint, S_IRUGO | S_IWUSR);
+module_param(ip_list_uid, uint, 0644);
+module_param(ip_list_gid, uint, 0644);
 MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list");
 MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs");
 MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files");
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 35bb6807927f..4221d98a314b 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1450,9 +1450,9 @@ static int __init nr_proto_init(void)
 
 	nr_loopback_init();
 
-	proc_create("nr", S_IRUGO, init_net.proc_net, &nr_info_fops);
-	proc_create("nr_neigh", S_IRUGO, init_net.proc_net, &nr_neigh_fops);
-	proc_create("nr_nodes", S_IRUGO, init_net.proc_net, &nr_nodes_fops);
+	proc_create("nr", 0444, init_net.proc_net, &nr_info_fops);
+	proc_create("nr_neigh", 0444, init_net.proc_net, &nr_neigh_fops);
+	proc_create("nr_nodes", 0444, init_net.proc_net, &nr_nodes_fops);
 out:
 	return rc;
 fail:
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index 5170373b797c..9ff5e0a76593 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1567,12 +1567,12 @@ static int __init rose_proto_init(void)
 
 	rose_add_loopback_neigh();
 
-	proc_create("rose", S_IRUGO, init_net.proc_net, &rose_info_fops);
-	proc_create("rose_neigh", S_IRUGO, init_net.proc_net,
+	proc_create("rose", 0444, init_net.proc_net, &rose_info_fops);
+	proc_create("rose_neigh", 0444, init_net.proc_net,
 		    &rose_neigh_fops);
-	proc_create("rose_nodes", S_IRUGO, init_net.proc_net,
+	proc_create("rose_nodes", 0444, init_net.proc_net,
 		    &rose_nodes_fops);
-	proc_create("rose_routes", S_IRUGO, init_net.proc_net,
+	proc_create("rose_routes", 0444, init_net.proc_net,
 		    &rose_routes_fops);
 out:
 	return rc;
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 0c9c18aa7c77..9e1c2c6b6a67 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -32,7 +32,7 @@ MODULE_LICENSE("GPL");
 MODULE_ALIAS_NETPROTO(PF_RXRPC);
 
 unsigned int rxrpc_debug; // = RXRPC_DEBUG_KPROTO;
-module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO);
+module_param_named(debug, rxrpc_debug, uint, 0644);
 MODULE_PARM_DESC(debug, "RxRPC debugging mask");
 
 static struct proto rxrpc_proto;
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 17d0155d9de3..1d9ccc6dab2b 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -450,17 +450,17 @@ int __net_init sctp_proc_init(struct net *net)
 	net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net);
 	if (!net->sctp.proc_net_sctp)
 		return -ENOMEM;
-	if (!proc_create("snmp", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_snmp_seq_fops))
+	if (!proc_create("snmp", 0444, net->sctp.proc_net_sctp,
+			 &sctp_snmp_seq_fops))
 		goto cleanup;
-	if (!proc_create("eps", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_eps_seq_fops))
+	if (!proc_create("eps", 0444, net->sctp.proc_net_sctp,
+			 &sctp_eps_seq_fops))
 		goto cleanup;
-	if (!proc_create("assocs", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_assocs_seq_fops))
+	if (!proc_create("assocs", 0444, net->sctp.proc_net_sctp,
+			 &sctp_assocs_seq_fops))
 		goto cleanup;
-	if (!proc_create("remaddr", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_remaddr_seq_fops))
+	if (!proc_create("remaddr", 0444, net->sctp.proc_net_sctp,
+			 &sctp_remaddr_seq_fops))
 		goto cleanup;
 	return 0;
 
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 26531193fce4..5089dbb96d58 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -1375,7 +1375,7 @@ static int create_use_gss_proxy_proc_entry(struct net *net)
 	struct proc_dir_entry **p = &sn->use_gssp_proc;
 
 	sn->use_gss_proxy = -1;
-	*p = proc_create_data("use-gss-proxy", S_IFREG|S_IRUSR|S_IWUSR,
+	*p = proc_create_data("use-gss-proxy", S_IFREG | 0600,
 			      sn->proc_net_rpc,
 			      &use_gss_proxy_ops, net);
 	if (!*p)
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 8a7e1c774f9c..c536cc24b3d1 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1621,20 +1621,20 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
 	if (cd->procfs == NULL)
 		goto out_nomem;
 
-	p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR,
+	p = proc_create_data("flush", S_IFREG | 0600,
 			     cd->procfs, &cache_flush_operations_procfs, cd);
 	if (p == NULL)
 		goto out_nomem;
 
 	if (cd->cache_request || cd->cache_parse) {
-		p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
-				cd->procfs, &cache_file_operations_procfs, cd);
+		p = proc_create_data("channel", S_IFREG | 0600, cd->procfs,
+				     &cache_file_operations_procfs, cd);
 		if (p == NULL)
 			goto out_nomem;
 	}
 	if (cd->cache_show) {
-		p = proc_create_data("content", S_IFREG|S_IRUSR,
-				cd->procfs, &content_file_operations_procfs, cd);
+		p = proc_create_data("content", S_IFREG | 0400, cd->procfs,
+				     &content_file_operations_procfs, cd);
 		if (p == NULL)
 			goto out_nomem;
 	}
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index e980d2a493de..45a033329cd4 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -139,7 +139,7 @@ rpc_clnt_debugfs_register(struct rpc_clnt *clnt)
 		return;
 
 	/* make tasks file */
-	if (!debugfs_create_file("tasks", S_IFREG | S_IRUSR, clnt->cl_debugfs,
+	if (!debugfs_create_file("tasks", S_IFREG | 0400, clnt->cl_debugfs,
 				 clnt, &tasks_fops))
 		goto out_err;
 
@@ -241,7 +241,7 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
 		return;
 
 	/* make tasks file */
-	if (!debugfs_create_file("info", S_IFREG | S_IRUSR, xprt->debugfs,
+	if (!debugfs_create_file("info", S_IFREG | 0400, xprt->debugfs,
 				 xprt, &xprt_info_fops)) {
 		debugfs_remove_recursive(xprt->debugfs);
 		xprt->debugfs = NULL;
@@ -317,7 +317,7 @@ inject_fault_dir(struct dentry *topdir)
 	if (!faultdir)
 		return NULL;
 
-	if (!debugfs_create_file("disconnect", S_IFREG | S_IRUSR, faultdir,
+	if (!debugfs_create_file("disconnect", S_IFREG | 0400, faultdir,
 				 NULL, &fault_disconnect_fops))
 		return NULL;
 
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index fc97fc3ed637..0f08934b2cea 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -820,13 +820,13 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
 {
 	struct dentry *dentry;
 	struct inode *dir = d_inode(parent);
-	umode_t umode = S_IFIFO | S_IRUSR | S_IWUSR;
+	umode_t umode = S_IFIFO | 0600;
 	int err;
 
 	if (pipe->ops->upcall == NULL)
-		umode &= ~S_IRUGO;
+		umode &= ~0444;
 	if (pipe->ops->downcall == NULL)
-		umode &= ~S_IWUGO;
+		umode &= ~0222;
 
 	inode_lock_nested(dir, I_MUTEX_PARENT);
 	dentry = __rpc_lookup_create_exclusive(parent, name);
@@ -1035,7 +1035,7 @@ static const struct rpc_filelist authfiles[] = {
 	[RPCAUTH_info] = {
 		.name = "info",
 		.i_fop = &rpc_info_operations,
-		.mode = S_IFREG | S_IRUSR,
+		.mode = S_IFREG | 0400,
 	},
 };
 
@@ -1068,8 +1068,8 @@ struct dentry *rpc_create_client_dir(struct dentry *dentry,
 {
 	struct dentry *ret;
 
-	ret = rpc_mkdir_populate(dentry, name, S_IRUGO | S_IXUGO, NULL,
-			rpc_clntdir_populate, rpc_client);
+	ret = rpc_mkdir_populate(dentry, name, 0555, NULL,
+				 rpc_clntdir_populate, rpc_client);
 	if (!IS_ERR(ret)) {
 		rpc_client->cl_pipedir_objects.pdh_dentry = ret;
 		rpc_create_pipe_dir_objects(&rpc_client->cl_pipedir_objects);
@@ -1096,17 +1096,17 @@ static const struct rpc_filelist cache_pipefs_files[3] = {
 	[0] = {
 		.name = "channel",
 		.i_fop = &cache_file_operations_pipefs,
-		.mode = S_IFREG|S_IRUSR|S_IWUSR,
+		.mode = S_IFREG | 0600,
 	},
 	[1] = {
 		.name = "content",
 		.i_fop = &content_file_operations_pipefs,
-		.mode = S_IFREG|S_IRUSR,
+		.mode = S_IFREG | 0400,
 	},
 	[2] = {
 		.name = "flush",
 		.i_fop = &cache_flush_operations_pipefs,
-		.mode = S_IFREG|S_IRUSR|S_IWUSR,
+		.mode = S_IFREG | 0600,
 	},
 };
 
@@ -1164,39 +1164,39 @@ enum {
 static const struct rpc_filelist files[] = {
 	[RPCAUTH_lockd] = {
 		.name = "lockd",
-		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+		.mode = S_IFDIR | 0555,
 	},
 	[RPCAUTH_mount] = {
 		.name = "mount",
-		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+		.mode = S_IFDIR | 0555,
 	},
 	[RPCAUTH_nfs] = {
 		.name = "nfs",
-		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+		.mode = S_IFDIR | 0555,
 	},
 	[RPCAUTH_portmap] = {
 		.name = "portmap",
-		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+		.mode = S_IFDIR | 0555,
 	},
 	[RPCAUTH_statd] = {
 		.name = "statd",
-		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+		.mode = S_IFDIR | 0555,
 	},
 	[RPCAUTH_nfsd4_cb] = {
 		.name = "nfsd4_cb",
-		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+		.mode = S_IFDIR | 0555,
 	},
 	[RPCAUTH_cache] = {
 		.name = "cache",
-		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+		.mode = S_IFDIR | 0555,
 	},
 	[RPCAUTH_nfsd] = {
 		.name = "nfsd",
-		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+		.mode = S_IFDIR | 0555,
 	},
 	[RPCAUTH_gssd] = {
 		.name = "gssd",
-		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+		.mode = S_IFDIR | 0555,
 	},
 };
 
@@ -1261,7 +1261,7 @@ EXPORT_SYMBOL_GPL(rpc_put_sb_net);
 static const struct rpc_filelist gssd_dummy_clnt_dir[] = {
 	[0] = {
 		.name = "clntXX",
-		.mode = S_IFDIR | S_IRUGO | S_IXUGO,
+		.mode = S_IFDIR | 0555,
 	},
 };
 
@@ -1310,7 +1310,7 @@ static const struct rpc_filelist gssd_dummy_info_file[] = {
 	[0] = {
 		.name = "info",
 		.i_fop = &rpc_dummy_info_operations,
-		.mode = S_IFREG | S_IRUSR,
+		.mode = S_IFREG | 0400,
 	},
 };
 
@@ -1397,7 +1397,7 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_d_op = &simple_dentry_operations;
 	sb->s_time_gran = 1;
 
-	inode = rpc_get_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO);
+	inode = rpc_get_inode(sb, S_IFDIR | 0555);
 	sb->s_root = root = d_make_root(inode);
 	if (!root)
 		return -ENOMEM;
diff --git a/net/wireless/wext-proc.c b/net/wireless/wext-proc.c
index 5511f989ef47..b4c464594a5e 100644
--- a/net/wireless/wext-proc.c
+++ b/net/wireless/wext-proc.c
@@ -142,7 +142,7 @@ static const struct file_operations wireless_seq_fops = {
 int __net_init wext_proc_init(struct net *net)
 {
 	/* Create /proc/net/wireless entry */
-	if (!proc_create("wireless", S_IRUGO, net->proc_net,
+	if (!proc_create("wireless", 0444, net->proc_net,
 			 &wireless_seq_fops))
 		return -ENOMEM;
 
diff --git a/net/x25/x25_proc.c b/net/x25/x25_proc.c
index 0917f047f2cf..64b415e93f6a 100644
--- a/net/x25/x25_proc.c
+++ b/net/x25/x25_proc.c
@@ -212,16 +212,16 @@ int __init x25_proc_init(void)
 	if (!proc_mkdir("x25", init_net.proc_net))
 		return -ENOMEM;
 
-	if (!proc_create("x25/route", S_IRUGO, init_net.proc_net,
-			&x25_seq_route_fops))
+	if (!proc_create("x25/route", 0444, init_net.proc_net,
+			 &x25_seq_route_fops))
 		goto out;
 
-	if (!proc_create("x25/socket", S_IRUGO, init_net.proc_net,
-			&x25_seq_socket_fops))
+	if (!proc_create("x25/socket", 0444, init_net.proc_net,
+			 &x25_seq_socket_fops))
 		goto out;
 
-	if (!proc_create("x25/forward", S_IRUGO, init_net.proc_net,
-			&x25_seq_forward_fops))
+	if (!proc_create("x25/forward", 0444, init_net.proc_net,
+			 &x25_seq_forward_fops))
 		goto out;
 	return 0;
 
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index 6d5f85f4e672..ed06903cd84d 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -79,7 +79,7 @@ static const struct file_operations xfrm_statistics_seq_fops = {
 
 int __net_init xfrm_proc_init(struct net *net)
 {
-	if (!proc_create("xfrm_stat", S_IRUGO, net->proc_net,
+	if (!proc_create("xfrm_stat", 0444, net->proc_net,
 			 &xfrm_statistics_seq_fops))
 		return -ENOMEM;
 	return 0;
-- 
cgit v1.2.3


From 855aeba34047f53e3665b0c3bcac80fe87ee2f7b Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Mar 2018 12:28:47 +0300
Subject: net: Convert rpcsec_gss_net_ops

These pernet_operations initialize and destroy sunrpc_net_id
refered per-net items. Only used global list is cache_list,
and accesses already serialized.

sunrpc_destroy_cache_detail() check for list_empty() without
cache_list_lock, but when it's called from unregister_pernet_subsys(),
there can't be callers in parallel, so we won't miss list_empty()
in this case.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Anna Schumaker <Anna.Schumaker@netapp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sunrpc/auth_gss/auth_gss.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 9463af4b32e8..44f939cb6bc8 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -2063,6 +2063,7 @@ static __net_exit void rpcsec_gss_exit_net(struct net *net)
 static struct pernet_operations rpcsec_gss_net_ops = {
 	.init = rpcsec_gss_init_net,
 	.exit = rpcsec_gss_exit_net,
+	.async = true,
 };
 
 /*
-- 
cgit v1.2.3


From 5e804a6077dccf154047a1ffe5b5232dce579659 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Mon, 26 Mar 2018 12:28:55 +0300
Subject: net: Convert sunrpc_net_ops

These pernet_operations look similar to rpcsec_gss_net_ops,
they just create and destroy another caches. So, they also
can be async.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Anna Schumaker <Anna.Schumaker@netapp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sunrpc/sunrpc_syms.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 56f9eff74150..68287e921847 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -79,6 +79,7 @@ static struct pernet_operations sunrpc_net_ops = {
 	.exit = sunrpc_exit_net,
 	.id = &sunrpc_net_id,
 	.size = sizeof(struct sunrpc_net),
+	.async = true,
 };
 
 static int __init
-- 
cgit v1.2.3


From bc67a0daf8f3bc6fa8fcb68090f3c444de7f951c Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Mon, 26 Mar 2018 15:01:31 +0300
Subject: ipmr: Make vif fib notifiers common

The fib-notifiers are tightly coupled with the vif_device which is
already common. Move the notifier struct definition and helpers to the
common file; Currently they're only used by ipmr.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c | 31 +++++--------------------------
 1 file changed, 5 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index f6be5db16da2..bb1a0655f8e4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -650,18 +650,8 @@ static int call_ipmr_vif_entry_notifier(struct notifier_block *nb,
 					struct vif_device *vif,
 					vifi_t vif_index, u32 tb_id)
 {
-	struct vif_entry_notifier_info info = {
-		.info = {
-			.family = RTNL_FAMILY_IPMR,
-			.net = net,
-		},
-		.dev = vif->dev,
-		.vif_index = vif_index,
-		.vif_flags = vif->flags,
-		.tb_id = tb_id,
-	};
-
-	return call_fib_notifier(nb, net, event_type, &info.info);
+	return mr_call_vif_notifier(nb, net, RTNL_FAMILY_IPMR, event_type,
+				    vif, vif_index, tb_id);
 }
 
 static int call_ipmr_vif_entry_notifiers(struct net *net,
@@ -669,20 +659,9 @@ static int call_ipmr_vif_entry_notifiers(struct net *net,
 					 struct vif_device *vif,
 					 vifi_t vif_index, u32 tb_id)
 {
-	struct vif_entry_notifier_info info = {
-		.info = {
-			.family = RTNL_FAMILY_IPMR,
-			.net = net,
-		},
-		.dev = vif->dev,
-		.vif_index = vif_index,
-		.vif_flags = vif->flags,
-		.tb_id = tb_id,
-	};
-
-	ASSERT_RTNL();
-	net->ipv4.ipmr_seq++;
-	return call_fib_notifiers(net, event_type, &info.info);
+	return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type,
+				     vif, vif_index, tb_id,
+				     &net->ipv4.ipmr_seq);
 }
 
 static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb,
-- 
cgit v1.2.3


From 54c4cad97b8fd414909b78d4274a6797baa52b3b Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Mon, 26 Mar 2018 15:01:32 +0300
Subject: ipmr: Make MFC fib notifiers common

Like vif notifications, move the notifier struct for MFC as well as its
helpers into a common file; Currently they're only used by ipmr.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index bb1a0655f8e4..470956d2d8ad 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -669,34 +669,16 @@ static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb,
 					enum fib_event_type event_type,
 					struct mfc_cache *mfc, u32 tb_id)
 {
-	struct mfc_entry_notifier_info info = {
-		.info = {
-			.family = RTNL_FAMILY_IPMR,
-			.net = net,
-		},
-		.mfc = mfc,
-		.tb_id = tb_id
-	};
-
-	return call_fib_notifier(nb, net, event_type, &info.info);
+	return mr_call_mfc_notifier(nb, net, RTNL_FAMILY_IPMR,
+				    event_type, &mfc->_c, tb_id);
 }
 
 static int call_ipmr_mfc_entry_notifiers(struct net *net,
 					 enum fib_event_type event_type,
 					 struct mfc_cache *mfc, u32 tb_id)
 {
-	struct mfc_entry_notifier_info info = {
-		.info = {
-			.family = RTNL_FAMILY_IPMR,
-			.net = net,
-		},
-		.mfc = mfc,
-		.tb_id = tb_id
-	};
-
-	ASSERT_RTNL();
-	net->ipv4.ipmr_seq++;
-	return call_fib_notifiers(net, event_type, &info.info);
+	return mr_call_mfc_notifiers(net, RTNL_FAMILY_IPMR, event_type,
+				     &mfc->_c, tb_id, &net->ipv4.ipmr_seq);
 }
 
 /**
-- 
cgit v1.2.3


From cdc9f9443b5c3a61c7cec807965054ee1fd29acf Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Mon, 26 Mar 2018 15:01:33 +0300
Subject: ipmr: Make ipmr_dump() common

Since all the primitive elements used for the notification done by ipmr
are now common [mr_table, mr_mfc, vif_device] we can refactor the logic
for dumping them to a common file.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c      | 53 ++--------------------------------------------------
 net/ipv4/ipmr_base.c | 42 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 51 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 470956d2d8ad..24c340021aba 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -644,16 +644,6 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
 }
 #endif
 
-static int call_ipmr_vif_entry_notifier(struct notifier_block *nb,
-					struct net *net,
-					enum fib_event_type event_type,
-					struct vif_device *vif,
-					vifi_t vif_index, u32 tb_id)
-{
-	return mr_call_vif_notifier(nb, net, RTNL_FAMILY_IPMR, event_type,
-				    vif, vif_index, tb_id);
-}
-
 static int call_ipmr_vif_entry_notifiers(struct net *net,
 					 enum fib_event_type event_type,
 					 struct vif_device *vif,
@@ -664,15 +654,6 @@ static int call_ipmr_vif_entry_notifiers(struct net *net,
 				     &net->ipv4.ipmr_seq);
 }
 
-static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb,
-					struct net *net,
-					enum fib_event_type event_type,
-					struct mfc_cache *mfc, u32 tb_id)
-{
-	return mr_call_mfc_notifier(nb, net, RTNL_FAMILY_IPMR,
-				    event_type, &mfc->_c, tb_id);
-}
-
 static int call_ipmr_mfc_entry_notifiers(struct net *net,
 					 enum fib_event_type event_type,
 					 struct mfc_cache *mfc, u32 tb_id)
@@ -2950,38 +2931,8 @@ static unsigned int ipmr_seq_read(struct net *net)
 
 static int ipmr_dump(struct net *net, struct notifier_block *nb)
 {
-	struct mr_table *mrt;
-	int err;
-
-	err = ipmr_rules_dump(net, nb);
-	if (err)
-		return err;
-
-	ipmr_for_each_table(mrt, net) {
-		struct vif_device *v = &mrt->vif_table[0];
-		struct mr_mfc *mfc;
-		int vifi;
-
-		/* Notifiy on table VIF entries */
-		read_lock(&mrt_lock);
-		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
-			if (!v->dev)
-				continue;
-
-			call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD,
-						     v, vifi, mrt->id);
-		}
-		read_unlock(&mrt_lock);
-
-		/* Notify on table MFC entries */
-		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
-			call_ipmr_mfc_entry_notifier(nb, net,
-						     FIB_EVENT_ENTRY_ADD,
-						     (struct mfc_cache *)mfc,
-						     mrt->id);
-	}
-
-	return 0;
+	return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump,
+		       ipmr_mr_table_iter, &mrt_lock);
 }
 
 static const struct fib_notifier_ops ipmr_notifier_ops_template = {
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 8ba55bfda817..4fe97723b53f 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -321,3 +321,45 @@ done:
 	return skb->len;
 }
 EXPORT_SYMBOL(mr_rtm_dumproute);
+
+int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
+	    int (*rules_dump)(struct net *net,
+			      struct notifier_block *nb),
+	    struct mr_table *(*mr_iter)(struct net *net,
+					struct mr_table *mrt),
+	    rwlock_t *mrt_lock)
+{
+	struct mr_table *mrt;
+	int err;
+
+	err = rules_dump(net, nb);
+	if (err)
+		return err;
+
+	for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) {
+		struct vif_device *v = &mrt->vif_table[0];
+		struct mr_mfc *mfc;
+		int vifi;
+
+		/* Notifiy on table VIF entries */
+		read_lock(mrt_lock);
+		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
+			if (!v->dev)
+				continue;
+
+			mr_call_vif_notifier(nb, net, family,
+					     FIB_EVENT_VIF_ADD,
+					     v, vifi, mrt->id);
+		}
+		read_unlock(mrt_lock);
+
+		/* Notify on table MFC entries */
+		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
+			mr_call_mfc_notifier(nb, net, family,
+					     FIB_EVENT_ENTRY_ADD,
+					     mfc, mrt->id);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(mr_dump);
-- 
cgit v1.2.3


From 088aa3eec2ce340b5d0f0f54430f5706223d5e45 Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Mon, 26 Mar 2018 15:01:34 +0300
Subject: ip6mr: Support fib notifications

In similar fashion to ipmr, support fib notifications for ip6mr mfc and
vif related events. This would later allow drivers to react to said
notifications and offload the IPv6 mroutes.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 104 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 7345bd6c4b7d..0be2f333e168 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -258,6 +258,16 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
 	fib_rules_unregister(net->ipv6.mr6_rules_ops);
 	rtnl_unlock();
 }
+
+static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb)
+{
+	return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR);
+}
+
+static unsigned int ip6mr_rules_seq_read(struct net *net)
+{
+	return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR);
+}
 #else
 #define ip6mr_for_each_table(mrt, net) \
 	for (mrt = net->ipv6.mrt6; mrt; mrt = NULL)
@@ -295,6 +305,16 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
 	net->ipv6.mrt6 = NULL;
 	rtnl_unlock();
 }
+
+static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb)
+{
+	return 0;
+}
+
+static unsigned int ip6mr_rules_seq_read(struct net *net)
+{
+	return 0;
+}
 #endif
 
 static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -653,10 +673,25 @@ failure:
 }
 #endif
 
-/*
- *	Delete a VIF entry
- */
+static int call_ip6mr_vif_entry_notifiers(struct net *net,
+					  enum fib_event_type event_type,
+					  struct vif_device *vif,
+					  mifi_t vif_index, u32 tb_id)
+{
+	return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type,
+				     vif, vif_index, tb_id,
+				     &net->ipv6.ipmr_seq);
+}
 
+static int call_ip6mr_mfc_entry_notifiers(struct net *net,
+					  enum fib_event_type event_type,
+					  struct mfc6_cache *mfc, u32 tb_id)
+{
+	return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type,
+				     &mfc->_c, tb_id, &net->ipv6.ipmr_seq);
+}
+
+/* Delete a VIF entry */
 static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
 		       struct list_head *head)
 {
@@ -669,6 +704,11 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
 
 	v = &mrt->vif_table[vifi];
 
+	if (VIF_EXISTS(mrt, vifi))
+		call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net),
+					       FIB_EVENT_VIF_DEL, v, vifi,
+					       mrt->id);
+
 	write_lock_bh(&mrt_lock);
 	dev = v->dev;
 	v->dev = NULL;
@@ -887,6 +927,8 @@ static int mif6_add(struct net *net, struct mr_table *mrt,
 	if (vifi + 1 > mrt->maxvif)
 		mrt->maxvif = vifi + 1;
 	write_unlock_bh(&mrt_lock);
+	call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD,
+				       v, vifi, mrt->id);
 	return 0;
 }
 
@@ -1175,6 +1217,8 @@ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc,
 	rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params);
 	list_del_rcu(&c->_c.list);
 
+	call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
+				       FIB_EVENT_ENTRY_DEL, c, mrt->id);
 	mr6_netlink_event(mrt, c, RTM_DELROUTE);
 	ip6mr_cache_free(c);
 	return 0;
@@ -1203,21 +1247,63 @@ static int ip6mr_device_event(struct notifier_block *this,
 	return NOTIFY_DONE;
 }
 
+static unsigned int ip6mr_seq_read(struct net *net)
+{
+	ASSERT_RTNL();
+
+	return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net);
+}
+
+static int ip6mr_dump(struct net *net, struct notifier_block *nb)
+{
+	return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump,
+		       ip6mr_mr_table_iter, &mrt_lock);
+}
+
 static struct notifier_block ip6_mr_notifier = {
 	.notifier_call = ip6mr_device_event
 };
 
-/*
- *	Setup for IP multicast routing
- */
+static const struct fib_notifier_ops ip6mr_notifier_ops_template = {
+	.family		= RTNL_FAMILY_IP6MR,
+	.fib_seq_read	= ip6mr_seq_read,
+	.fib_dump	= ip6mr_dump,
+	.owner		= THIS_MODULE,
+};
+
+static int __net_init ip6mr_notifier_init(struct net *net)
+{
+	struct fib_notifier_ops *ops;
+
+	net->ipv6.ipmr_seq = 0;
 
+	ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net);
+	if (IS_ERR(ops))
+		return PTR_ERR(ops);
+
+	net->ipv6.ip6mr_notifier_ops = ops;
+
+	return 0;
+}
+
+static void __net_exit ip6mr_notifier_exit(struct net *net)
+{
+	fib_notifier_ops_unregister(net->ipv6.ip6mr_notifier_ops);
+	net->ipv6.ip6mr_notifier_ops = NULL;
+}
+
+/* Setup for IP multicast routing */
 static int __net_init ip6mr_net_init(struct net *net)
 {
 	int err;
 
+	err = ip6mr_notifier_init(net);
+	if (err)
+		return err;
+
 	err = ip6mr_rules_init(net);
 	if (err < 0)
-		goto fail;
+		goto ip6mr_rules_fail;
 
 #ifdef CONFIG_PROC_FS
 	err = -ENOMEM;
@@ -1235,7 +1321,8 @@ proc_cache_fail:
 proc_vif_fail:
 	ip6mr_rules_exit(net);
 #endif
-fail:
+ip6mr_rules_fail:
+	ip6mr_notifier_exit(net);
 	return err;
 }
 
@@ -1246,6 +1333,7 @@ static void __net_exit ip6mr_net_exit(struct net *net)
 	remove_proc_entry("ip6_mr_vif", net->proc_net);
 #endif
 	ip6mr_rules_exit(net);
+	ip6mr_notifier_exit(net);
 }
 
 static struct pernet_operations ip6mr_net_ops = {
@@ -1337,6 +1425,8 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
 		if (!mrtsock)
 			c->_c.mfc_flags |= MFC_STATIC;
 		write_unlock_bh(&mrt_lock);
+		call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
+					       c, mrt->id);
 		mr6_netlink_event(mrt, c, RTM_NEWROUTE);
 		return 0;
 	}
@@ -1388,6 +1478,8 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
 		ip6mr_cache_resolve(net, mrt, uc, c);
 		ip6mr_cache_free(uc);
 	}
+	call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
+				       c, mrt->id);
 	mr6_netlink_event(mrt, c, RTM_NEWROUTE);
 	return 0;
 }
@@ -1424,6 +1516,10 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 		spin_lock_bh(&mfc_unres_lock);
 		list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
 			list_del(&c->list);
+			call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
+						       FIB_EVENT_ENTRY_DEL,
+						       (struct mfc6_cache *)c,
+						       mrt->id);
 			mr6_netlink_event(mrt, (struct mfc6_cache *)c,
 					  RTM_DELROUTE);
 			ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c);
-- 
cgit v1.2.3


From d3c07e5b9939a055fa017f200e535ae947eb22ab Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Mon, 26 Mar 2018 15:01:35 +0300
Subject: ip6mr: Add API for default_rule fib

Add the ability to discern whether a given FIB rule notification relates
to the default rule inserted when registering ip6mr or a different one.

Would later be used by drivers wishing to offload ipv6 multicast routes
but unable to offload rules other than the default one.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6mr.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 0be2f333e168..a187c523a95f 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -268,6 +268,13 @@ static unsigned int ip6mr_rules_seq_read(struct net *net)
 {
 	return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR);
 }
+
+bool ip6mr_rule_default(const struct fib_rule *rule)
+{
+	return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL &&
+	       rule->table == RT6_TABLE_DFLT && !rule->l3mdev;
+}
+EXPORT_SYMBOL(ip6mr_rule_default);
 #else
 #define ip6mr_for_each_table(mrt, net) \
 	for (mrt = net->ipv6.mrt6; mrt; mrt = NULL)
-- 
cgit v1.2.3


From 8c13af2a219c6498071b30ea558438c74267ae4d Mon Sep 17 00:00:00 2001
From: Yuval Mintz <yuvalm@mellanox.com>
Date: Mon, 26 Mar 2018 15:01:36 +0300
Subject: ip6mr: Add refcounting to mfc

Since ipmr and ip6mr are using the same mr_mfc struct at their core, we
can now refactor the ipmr_cache_{hold,put} logic and apply refcounting
to both ipmr and ip6mr.

Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c  | 8 ++++----
 net/ipv6/ip6mr.c | 6 ++++--
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 24c340021aba..e79211a8537c 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -732,11 +732,10 @@ static void ipmr_cache_free_rcu(struct rcu_head *head)
 	kmem_cache_free(mrt_cachep, (struct mfc_cache *)c);
 }
 
-void ipmr_cache_free(struct mfc_cache *c)
+static void ipmr_cache_free(struct mfc_cache *c)
 {
 	call_rcu(&c->_c.rcu, ipmr_cache_free_rcu);
 }
-EXPORT_SYMBOL(ipmr_cache_free);
 
 /* Destroy an unresolved cache entry, killing queued skbs
  * and reporting error to netlink readers.
@@ -987,6 +986,7 @@ static struct mfc_cache *ipmr_cache_alloc(void)
 	if (c) {
 		c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
 		c->_c.mfc_un.res.minvif = MAXVIFS;
+		c->_c.free = ipmr_cache_free_rcu;
 		refcount_set(&c->_c.mfc_un.res.refcount, 1);
 	}
 	return c;
@@ -1206,7 +1206,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
 	list_del_rcu(&c->_c.list);
 	call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id);
 	mroute_netlink_event(mrt, c, RTM_DELROUTE);
-	ipmr_cache_put(c);
+	mr_cache_put(&c->_c);
 
 	return 0;
 }
@@ -1318,7 +1318,7 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 		call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache,
 					      mrt->id);
 		mroute_netlink_event(mrt, cache, RTM_DELROUTE);
-		ipmr_cache_put(cache);
+		mr_cache_put(c);
 	}
 
 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index a187c523a95f..1c8fa29d155a 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -989,6 +989,8 @@ static struct mfc6_cache *ip6mr_cache_alloc(void)
 		return NULL;
 	c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
 	c->_c.mfc_un.res.minvif = MAXMIFS;
+	c->_c.free = ip6mr_cache_free_rcu;
+	refcount_set(&c->_c.mfc_un.res.refcount, 1);
 	return c;
 }
 
@@ -1227,7 +1229,7 @@ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc,
 	call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
 				       FIB_EVENT_ENTRY_DEL, c, mrt->id);
 	mr6_netlink_event(mrt, c, RTM_DELROUTE);
-	ip6mr_cache_free(c);
+	mr_cache_put(&c->_c);
 	return 0;
 }
 
@@ -1516,7 +1518,7 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
 		rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params);
 		list_del_rcu(&c->list);
 		mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
-		ip6mr_cache_free((struct mfc6_cache *)c);
+		mr_cache_put(c);
 	}
 
 	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
-- 
cgit v1.2.3


From e1577c1c881b09e9f15a743a4a1907815b74d0f7 Mon Sep 17 00:00:00 2001
From: Inbar Karmy <inbark@mellanox.com>
Date: Mon, 20 Nov 2017 16:14:30 +0200
Subject: ethtool: Add support for configuring PFC stall prevention in ethtool

In the event where the device unexpectedly becomes unresponsive
for a long period of time, flow control mechanism may propagate
pause frames which will cause congestion spreading to the entire
network.
To prevent this scenario, when the device is stalled for a period
longer than a pre-configured timeout, flow control mechanisms are
automatically disabled.

This patch adds support for the ETHTOOL_PFC_STALL_PREVENTION
as a tunable.
This API provides support for configuring flow control storm prevention
timeout (msec).

Signed-off-by: Inbar Karmy <inbark@mellanox.com>
Cc: Michal Kubecek <mkubecek@suse.cz>
Cc: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 net/core/ethtool.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 157cd9efa4be..bb6e498c6e3d 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -121,6 +121,7 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
 	[ETHTOOL_ID_UNSPEC]     = "Unspec",
 	[ETHTOOL_RX_COPYBREAK]	= "rx-copybreak",
 	[ETHTOOL_TX_COPYBREAK]	= "tx-copybreak",
+	[ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout",
 };
 
 static const char
@@ -2311,6 +2312,11 @@ static int ethtool_tunable_valid(const struct ethtool_tunable *tuna)
 		    tuna->type_id != ETHTOOL_TUNABLE_U32)
 			return -EINVAL;
 		break;
+	case ETHTOOL_PFC_PREVENTION_TOUT:
+		if (tuna->len != sizeof(u16) ||
+		    tuna->type_id != ETHTOOL_TUNABLE_U16)
+			return -EINVAL;
+		break;
 	default:
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From 5306653850b444452937834adc5a5ac63bae275e Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Mon, 26 Mar 2018 16:55:00 +0800
Subject: sctp: remove unnecessary asoc in sctp_has_association

After Commit dae399d7fdee ("sctp: hold transport instead of assoc
when lookup assoc in rx path"), it put transport instead of asoc
in sctp_has_association. Variable 'asoc' is not used any more.

So this patch is to remove it, while at it,  it also changes the
return type of sctp_has_association to bool, and does the same
for it's caller sctp_endpoint_is_peeled_off.

Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/endpointola.c |  8 ++++----
 net/sctp/input.c       | 13 ++++++-------
 2 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 8b3146816519..e2f5a3ee41a7 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -349,8 +349,8 @@ out:
 /* Look for any peeled off association from the endpoint that matches the
  * given peer address.
  */
-int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
-				const union sctp_addr *paddr)
+bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
+				 const union sctp_addr *paddr)
 {
 	struct sctp_sockaddr_entry *addr;
 	struct sctp_bind_addr *bp;
@@ -362,10 +362,10 @@ int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep,
 	 */
 	list_for_each_entry(addr, &bp->address_list, list) {
 		if (sctp_has_association(net, &addr->a, paddr))
-			return 1;
+			return true;
 	}
 
-	return 0;
+	return false;
 }
 
 /* Do delayed input processing.  This is scheduled by sctp_rcv().
diff --git a/net/sctp/input.c b/net/sctp/input.c
index b381d78548ac..ba8a6e6c36fa 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -1010,19 +1010,18 @@ struct sctp_association *sctp_lookup_association(struct net *net,
 }
 
 /* Is there an association matching the given local and peer addresses? */
-int sctp_has_association(struct net *net,
-			 const union sctp_addr *laddr,
-			 const union sctp_addr *paddr)
+bool sctp_has_association(struct net *net,
+			  const union sctp_addr *laddr,
+			  const union sctp_addr *paddr)
 {
-	struct sctp_association *asoc;
 	struct sctp_transport *transport;
 
-	if ((asoc = sctp_lookup_association(net, laddr, paddr, &transport))) {
+	if (sctp_lookup_association(net, laddr, paddr, &transport)) {
 		sctp_transport_put(transport);
-		return 1;
+		return true;
 	}
 
-	return 0;
+	return false;
 }
 
 /*
-- 
cgit v1.2.3


From 8daf1a2d7e40685054ebae680733d822ced6df62 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Mon, 26 Mar 2018 12:27:12 +0100
Subject: net/ncsi: check for null return from call to nla_nest_start

The call to nla_nest_start calls nla_put which can lead to a NULL
return so it's possible for attr to become NULL and we can potentially
get a NULL pointer dereference on attr.  Fix this by checking for
a NULL return.

Detected by CoverityScan, CID#1466125 ("Dereference null return")

Fixes: 955dc68cb9b2 ("net/ncsi: Add generic netlink family")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ncsi/ncsi-netlink.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c
index 05fcfb4fbe1d..8d7e849d4825 100644
--- a/net/ncsi/ncsi-netlink.c
+++ b/net/ncsi/ncsi-netlink.c
@@ -190,6 +190,10 @@ static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info)
 	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
 
 	attr = nla_nest_start(skb, NCSI_ATTR_PACKAGE_LIST);
+	if (!attr) {
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
 	rc = ncsi_write_package_info(skb, ndp, package_id);
 
 	if (rc) {
-- 
cgit v1.2.3


From c76f2481b60a62814f4cb1678e48efa3385895aa Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Mon, 26 Mar 2018 14:32:44 +0000
Subject: tipc: fix error handling in tipc_udp_enable()

Release alloced resource before return from the error handling
case in tipc_udp_enable(), otherwise will cause memory leak.

Fixes: 52dfae5c85a4 ("tipc: obtain node identity from interface by default")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/udp_media.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index 2c13b18426d9..e7d91f5d5cae 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -687,7 +687,8 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
 	}
 	if (!tipc_own_id(net)) {
 		pr_warn("Failed to set node id, please configure manually\n");
-		return -EINVAL;
+		err = -EINVAL;
+		goto err;
 	}
 
 	b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP;
-- 
cgit v1.2.3


From e1a22d13eb1f302afd692583777e27828d375a39 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Mon, 26 Mar 2018 14:33:13 +0000
Subject: tipc: tipc_node_create() can be static

Fixes the following sparse warning:

net/tipc/node.c:336:18: warning:
 symbol 'tipc_node_create' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/node.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/tipc/node.c b/net/tipc/node.c
index 4a95c8c155c6..4fb4327311bb 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -333,8 +333,8 @@ static void tipc_node_write_unlock(struct tipc_node *n)
 	}
 }
 
-struct tipc_node *tipc_node_create(struct net *net, u32 addr,
-				   u8 *peer_id, u16 capabilities)
+static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
+					  u8 *peer_id, u16 capabilities)
 {
 	struct tipc_net *tn = net_generic(net, tipc_net_id);
 	struct tipc_node *n, *temp_node;
-- 
cgit v1.2.3


From e32ac25018558f7bcab387708187ab5aa2733cf8 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 26 Mar 2018 08:35:01 -0700
Subject: ipv6: addrconf: Use normal debugging style

Remove local ADBG macro and use netdev_dbg/pr_debug

Miscellanea:

o Remove unnecessary debug message after allocation failure as there
  already is a dump_stack() on the failure paths
o Leave the allocation failure message on snmp6_alloc_dev as there
  is one code path that does not do a dump_stack()

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 9ca90cd9fba4..189eac80f4ef 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -94,15 +94,6 @@
 #include <linux/seq_file.h>
 #include <linux/export.h>
 
-/* Set to 3 to get tracing... */
-#define ACONF_DEBUG 2
-
-#if ACONF_DEBUG >= 3
-#define ADBG(fmt, ...) printk(fmt, ##__VA_ARGS__)
-#else
-#define ADBG(fmt, ...) do { if (0) printk(fmt, ##__VA_ARGS__); } while (0)
-#endif
-
 #define	INFINITY_LIFE_TIME	0xFFFFFFFF
 
 #define IPV6_MAX_STRLEN \
@@ -409,9 +400,8 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	dev_hold(dev);
 
 	if (snmp6_alloc_dev(ndev) < 0) {
-		ADBG(KERN_WARNING
-			"%s: cannot allocate memory for statistics; dev=%s.\n",
-			__func__, dev->name);
+		netdev_dbg(dev, "%s: cannot allocate memory for statistics\n",
+			   __func__);
 		neigh_parms_release(&nd_tbl, ndev->nd_parms);
 		dev_put(dev);
 		kfree(ndev);
@@ -419,9 +409,8 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	}
 
 	if (snmp6_register_dev(ndev) < 0) {
-		ADBG(KERN_WARNING
-			"%s: cannot create /proc/net/dev_snmp6/%s\n",
-			__func__, dev->name);
+		netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n",
+			   __func__, dev->name);
 		goto err_release;
 	}
 
@@ -984,7 +973,7 @@ static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
 
 	/* Ignore adding duplicate addresses on an interface */
 	if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev, hash)) {
-		ADBG("ipv6_add_addr: already assigned\n");
+		netdev_dbg(dev, "ipv6_add_addr: already assigned\n");
 		err = -EEXIST;
 	} else {
 		hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]);
@@ -1044,7 +1033,6 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 
 	ifa = kzalloc(sizeof(*ifa), gfp_flags);
 	if (!ifa) {
-		ADBG("ipv6_add_addr: malloc failed\n");
 		err = -ENOBUFS;
 		goto out;
 	}
@@ -2618,7 +2606,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
 	pinfo = (struct prefix_info *) opt;
 
 	if (len < sizeof(struct prefix_info)) {
-		ADBG("addrconf: prefix option too short\n");
+		netdev_dbg(dev, "addrconf: prefix option too short\n");
 		return;
 	}
 
@@ -4446,8 +4434,8 @@ restart:
 	if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX))
 		next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX;
 
-	ADBG(KERN_DEBUG "now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
-	      now, next, next_sec, next_sched);
+	pr_debug("now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
+		 now, next, next_sec, next_sched);
 	mod_delayed_work(addrconf_wq, &addr_chk_work, next_sched - now);
 	rcu_read_unlock_bh();
 }
-- 
cgit v1.2.3


From 094374e5e173c6639eccf6a2af5e1357a0869848 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 27 Mar 2018 18:02:01 +0300
Subject: net: Reflect all pernet_operations are converted

All pernet_operations are reviewed and converted, hooray!
Reflect this in core code: setup_net() and cleanup_net()
will take down_read() always.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 43 ++++++-------------------------------------
 1 file changed, 6 insertions(+), 37 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 95ba2c53bd9a..0f614523a13f 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -40,9 +40,8 @@ struct net init_net = {
 EXPORT_SYMBOL(init_net);
 
 static bool init_net_initialized;
-static unsigned nr_sync_pernet_ops;
 /*
- * net_sem: protects: pernet_list, net_generic_ids, nr_sync_pernet_ops,
+ * net_sem: protects: pernet_list, net_generic_ids,
  * init_net_initialized and first_device pointer.
  */
 DECLARE_RWSEM(net_sem);
@@ -406,7 +405,6 @@ struct net *copy_net_ns(unsigned long flags,
 {
 	struct ucounts *ucounts;
 	struct net *net;
-	unsigned write;
 	int rv;
 
 	if (!(flags & CLONE_NEWNET))
@@ -424,25 +422,14 @@ struct net *copy_net_ns(unsigned long flags,
 	refcount_set(&net->passive, 1);
 	net->ucounts = ucounts;
 	get_user_ns(user_ns);
-again:
-	write = READ_ONCE(nr_sync_pernet_ops);
-	if (write)
-		rv = down_write_killable(&net_sem);
-	else
-		rv = down_read_killable(&net_sem);
+
+	rv = down_read_killable(&net_sem);
 	if (rv < 0)
 		goto put_userns;
 
-	if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) {
-		up_read(&net_sem);
-		goto again;
-	}
 	rv = setup_net(net, user_ns);
 
-	if (write)
-		up_write(&net_sem);
-	else
-		up_read(&net_sem);
+	up_read(&net_sem);
 
 	if (rv < 0) {
 put_userns:
@@ -490,21 +477,11 @@ static void cleanup_net(struct work_struct *work)
 	struct net *net, *tmp, *last;
 	struct llist_node *net_kill_list;
 	LIST_HEAD(net_exit_list);
-	unsigned write;
 
 	/* Atomically snapshot the list of namespaces to cleanup */
 	net_kill_list = llist_del_all(&cleanup_list);
-again:
-	write = READ_ONCE(nr_sync_pernet_ops);
-	if (write)
-		down_write(&net_sem);
-	else
-		down_read(&net_sem);
 
-	if (!write && unlikely(READ_ONCE(nr_sync_pernet_ops))) {
-		up_read(&net_sem);
-		goto again;
-	}
+	down_read(&net_sem);
 
 	/* Don't let anyone else find us. */
 	rtnl_lock();
@@ -543,10 +520,7 @@ again:
 	list_for_each_entry_reverse(ops, &pernet_list, list)
 		ops_free_list(ops, &net_exit_list);
 
-	if (write)
-		up_write(&net_sem);
-	else
-		up_read(&net_sem);
+	up_read(&net_sem);
 
 	/* Ensure there are no outstanding rcu callbacks using this
 	 * network namespace.
@@ -1006,9 +980,6 @@ again:
 		rcu_barrier();
 		if (ops->id)
 			ida_remove(&net_generic_ids, *ops->id);
-	} else if (!ops->async) {
-		pr_info_once("Pernet operations %ps are sync.\n", ops);
-		nr_sync_pernet_ops++;
 	}
 
 	return error;
@@ -1016,8 +987,6 @@ again:
 
 static void unregister_pernet_operations(struct pernet_operations *ops)
 {
-	if (!ops->async)
-		BUG_ON(nr_sync_pernet_ops-- == 0);
 	__unregister_pernet_operations(ops);
 	rcu_barrier();
 	if (ops->id)
-- 
cgit v1.2.3


From 2f635ceeb22ba13c307236d69795fbb29cfa3e7c Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 27 Mar 2018 18:02:13 +0300
Subject: net: Drop pernet_operations::async

Synchronous pernet_operations are not allowed anymore.
All are asynchronous. So, drop the structure member.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.c                               | 1 -
 net/bridge/br.c                                | 1 -
 net/bridge/br_netfilter_hooks.c                | 1 -
 net/bridge/netfilter/ebtable_broute.c          | 1 -
 net/bridge/netfilter/ebtable_filter.c          | 1 -
 net/bridge/netfilter/ebtable_nat.c             | 1 -
 net/bridge/netfilter/nf_log_bridge.c           | 1 -
 net/caif/caif_dev.c                            | 1 -
 net/can/af_can.c                               | 1 -
 net/can/bcm.c                                  | 1 -
 net/can/gw.c                                   | 1 -
 net/core/dev.c                                 | 2 --
 net/core/fib_notifier.c                        | 1 -
 net/core/fib_rules.c                           | 1 -
 net/core/net-procfs.c                          | 2 --
 net/core/net_namespace.c                       | 2 --
 net/core/pktgen.c                              | 1 -
 net/core/rtnetlink.c                           | 1 -
 net/core/sock.c                                | 2 --
 net/core/sock_diag.c                           | 1 -
 net/core/sysctl_net_core.c                     | 1 -
 net/dccp/ipv4.c                                | 1 -
 net/dccp/ipv6.c                                | 1 -
 net/ieee802154/6lowpan/reassembly.c            | 1 -
 net/ieee802154/core.c                          | 1 -
 net/ipv4/af_inet.c                             | 2 --
 net/ipv4/arp.c                                 | 1 -
 net/ipv4/devinet.c                             | 1 -
 net/ipv4/fib_frontend.c                        | 1 -
 net/ipv4/fou.c                                 | 1 -
 net/ipv4/icmp.c                                | 1 -
 net/ipv4/igmp.c                                | 1 -
 net/ipv4/ip_fragment.c                         | 1 -
 net/ipv4/ip_gre.c                              | 3 ---
 net/ipv4/ip_vti.c                              | 1 -
 net/ipv4/ipip.c                                | 1 -
 net/ipv4/ipmr.c                                | 1 -
 net/ipv4/netfilter/arp_tables.c                | 1 -
 net/ipv4/netfilter/arptable_filter.c           | 1 -
 net/ipv4/netfilter/ip_tables.c                 | 1 -
 net/ipv4/netfilter/ipt_CLUSTERIP.c             | 1 -
 net/ipv4/netfilter/iptable_filter.c            | 1 -
 net/ipv4/netfilter/iptable_mangle.c            | 1 -
 net/ipv4/netfilter/iptable_nat.c               | 1 -
 net/ipv4/netfilter/iptable_raw.c               | 1 -
 net/ipv4/netfilter/iptable_security.c          | 1 -
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 1 -
 net/ipv4/netfilter/nf_defrag_ipv4.c            | 1 -
 net/ipv4/netfilter/nf_log_arp.c                | 1 -
 net/ipv4/netfilter/nf_log_ipv4.c               | 1 -
 net/ipv4/ping.c                                | 1 -
 net/ipv4/proc.c                                | 1 -
 net/ipv4/raw.c                                 | 1 -
 net/ipv4/route.c                               | 4 ----
 net/ipv4/sysctl_net_ipv4.c                     | 1 -
 net/ipv4/tcp_ipv4.c                            | 2 --
 net/ipv4/tcp_metrics.c                         | 1 -
 net/ipv4/udp.c                                 | 2 --
 net/ipv4/udplite.c                             | 1 -
 net/ipv4/xfrm4_policy.c                        | 1 -
 net/ipv6/addrconf.c                            | 2 --
 net/ipv6/addrlabel.c                           | 1 -
 net/ipv6/af_inet6.c                            | 1 -
 net/ipv6/fib6_rules.c                          | 1 -
 net/ipv6/icmp.c                                | 1 -
 net/ipv6/ila/ila_xlat.c                        | 1 -
 net/ipv6/ip6_fib.c                             | 1 -
 net/ipv6/ip6_flowlabel.c                       | 1 -
 net/ipv6/ip6_gre.c                             | 1 -
 net/ipv6/ip6_tunnel.c                          | 1 -
 net/ipv6/ip6_vti.c                             | 1 -
 net/ipv6/ip6mr.c                               | 1 -
 net/ipv6/mcast.c                               | 1 -
 net/ipv6/ndisc.c                               | 1 -
 net/ipv6/netfilter/ip6_tables.c                | 1 -
 net/ipv6/netfilter/ip6table_filter.c           | 1 -
 net/ipv6/netfilter/ip6table_mangle.c           | 1 -
 net/ipv6/netfilter/ip6table_nat.c              | 1 -
 net/ipv6/netfilter/ip6table_raw.c              | 1 -
 net/ipv6/netfilter/ip6table_security.c         | 1 -
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 1 -
 net/ipv6/netfilter/nf_conntrack_reasm.c        | 1 -
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c      | 1 -
 net/ipv6/netfilter/nf_log_ipv6.c               | 1 -
 net/ipv6/ping.c                                | 1 -
 net/ipv6/proc.c                                | 1 -
 net/ipv6/raw.c                                 | 1 -
 net/ipv6/reassembly.c                          | 1 -
 net/ipv6/route.c                               | 3 ---
 net/ipv6/seg6.c                                | 1 -
 net/ipv6/sit.c                                 | 1 -
 net/ipv6/sysctl_net_ipv6.c                     | 1 -
 net/ipv6/tcp_ipv6.c                            | 1 -
 net/ipv6/udplite.c                             | 1 -
 net/ipv6/xfrm6_policy.c                        | 1 -
 net/ipv6/xfrm6_tunnel.c                        | 1 -
 net/kcm/kcmproc.c                              | 1 -
 net/kcm/kcmsock.c                              | 1 -
 net/key/af_key.c                               | 1 -
 net/l2tp/l2tp_core.c                           | 1 -
 net/l2tp/l2tp_ppp.c                            | 1 -
 net/mpls/af_mpls.c                             | 1 -
 net/netfilter/core.c                           | 1 -
 net/netfilter/ipset/ip_set_core.c              | 1 -
 net/netfilter/ipvs/ip_vs_core.c                | 2 --
 net/netfilter/ipvs/ip_vs_ftp.c                 | 1 -
 net/netfilter/ipvs/ip_vs_lblc.c                | 1 -
 net/netfilter/ipvs/ip_vs_lblcr.c               | 1 -
 net/netfilter/nf_conntrack_netlink.c           | 1 -
 net/netfilter/nf_conntrack_proto_gre.c         | 1 -
 net/netfilter/nf_conntrack_standalone.c        | 1 -
 net/netfilter/nf_log.c                         | 1 -
 net/netfilter/nf_log_netdev.c                  | 1 -
 net/netfilter/nf_synproxy_core.c               | 1 -
 net/netfilter/nf_tables_api.c                  | 1 -
 net/netfilter/nfnetlink.c                      | 1 -
 net/netfilter/nfnetlink_acct.c                 | 1 -
 net/netfilter/nfnetlink_cttimeout.c            | 1 -
 net/netfilter/nfnetlink_log.c                  | 1 -
 net/netfilter/nfnetlink_queue.c                | 1 -
 net/netfilter/x_tables.c                       | 1 -
 net/netfilter/xt_hashlimit.c                   | 1 -
 net/netfilter/xt_recent.c                      | 1 -
 net/netlink/af_netlink.c                       | 2 --
 net/netlink/genetlink.c                        | 1 -
 net/openvswitch/datapath.c                     | 1 -
 net/packet/af_packet.c                         | 1 -
 net/phonet/pn_dev.c                            | 1 -
 net/rds/tcp.c                                  | 1 -
 net/rxrpc/net_ns.c                             | 1 -
 net/sched/act_api.c                            | 1 -
 net/sched/act_bpf.c                            | 1 -
 net/sched/act_connmark.c                       | 1 -
 net/sched/act_csum.c                           | 1 -
 net/sched/act_gact.c                           | 1 -
 net/sched/act_ife.c                            | 1 -
 net/sched/act_ipt.c                            | 2 --
 net/sched/act_mirred.c                         | 1 -
 net/sched/act_nat.c                            | 1 -
 net/sched/act_pedit.c                          | 1 -
 net/sched/act_police.c                         | 1 -
 net/sched/act_sample.c                         | 1 -
 net/sched/act_simple.c                         | 1 -
 net/sched/act_skbedit.c                        | 1 -
 net/sched/act_skbmod.c                         | 1 -
 net/sched/act_tunnel_key.c                     | 1 -
 net/sched/act_vlan.c                           | 1 -
 net/sched/cls_api.c                            | 1 -
 net/sched/sch_api.c                            | 1 -
 net/sctp/protocol.c                            | 2 --
 net/sunrpc/auth_gss/auth_gss.c                 | 1 -
 net/sunrpc/sunrpc_syms.c                       | 1 -
 net/sysctl_net.c                               | 1 -
 net/tipc/core.c                                | 1 -
 net/unix/af_unix.c                             | 1 -
 net/wireless/core.c                            | 1 -
 net/wireless/wext-core.c                       | 1 -
 net/xfrm/xfrm_policy.c                         | 1 -
 net/xfrm/xfrm_user.c                           | 1 -
 159 files changed, 178 deletions(-)

(limited to 'net')

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index bd0ed39f65fb..bad01b14a4ad 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -729,7 +729,6 @@ static struct pernet_operations vlan_net_ops = {
 	.exit = vlan_exit_net,
 	.id   = &vlan_net_id,
 	.size = sizeof(struct vlan_net),
-	.async = true,
 };
 
 static int __init vlan_proto_init(void)
diff --git a/net/bridge/br.c b/net/bridge/br.c
index a3f95ab9d6a3..26e1616b2c90 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -188,7 +188,6 @@ static void __net_exit br_net_exit(struct net *net)
 
 static struct pernet_operations br_net_ops = {
 	.exit	= br_net_exit,
-	.async	= true,
 };
 
 static const struct stp_proto br_stp_proto = {
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index c2120eb889a9..9b16eaf33819 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -969,7 +969,6 @@ static struct pernet_operations brnf_net_ops __read_mostly = {
 	.exit = brnf_exit_net,
 	.id   = &brnf_net_id,
 	.size = sizeof(struct brnf_net),
-	.async = true,
 };
 
 static struct notifier_block brnf_notifier __read_mostly = {
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index f070b5e5b9dd..276b60262981 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -77,7 +77,6 @@ static void __net_exit broute_net_exit(struct net *net)
 static struct pernet_operations broute_net_ops = {
 	.init = broute_net_init,
 	.exit = broute_net_exit,
-	.async = true,
 };
 
 static int __init ebtable_broute_init(void)
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index 4151afc8efcc..c41da5fac84f 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -105,7 +105,6 @@ static void __net_exit frame_filter_net_exit(struct net *net)
 static struct pernet_operations frame_filter_net_ops = {
 	.init = frame_filter_net_init,
 	.exit = frame_filter_net_exit,
-	.async = true,
 };
 
 static int __init ebtable_filter_init(void)
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index b8da2dfe2ec5..08df7406ecb3 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -105,7 +105,6 @@ static void __net_exit frame_nat_net_exit(struct net *net)
 static struct pernet_operations frame_nat_net_ops = {
 	.init = frame_nat_net_init,
 	.exit = frame_nat_net_exit,
-	.async = true,
 };
 
 static int __init ebtable_nat_init(void)
diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c
index 91bfc2ac055a..bd2b3c78f59b 100644
--- a/net/bridge/netfilter/nf_log_bridge.c
+++ b/net/bridge/netfilter/nf_log_bridge.c
@@ -48,7 +48,6 @@ static void __net_exit nf_log_bridge_net_exit(struct net *net)
 static struct pernet_operations nf_log_bridge_net_ops = {
 	.init = nf_log_bridge_net_init,
 	.exit = nf_log_bridge_net_exit,
-	.async = true,
 };
 
 static int __init nf_log_bridge_init(void)
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index 7a78268cc572..e0adcd123f48 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -544,7 +544,6 @@ static struct pernet_operations caif_net_ops = {
 	.exit = caif_exit_net,
 	.id   = &caif_net_id,
 	.size = sizeof(struct caif_net),
-	.async = true,
 };
 
 /* Initialize Caif devices list */
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 2f0d0a72e4b5..1684ba5b51eb 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -954,7 +954,6 @@ static struct notifier_block can_netdev_notifier __read_mostly = {
 static struct pernet_operations can_pernet_ops __read_mostly = {
 	.init = can_pernet_init,
 	.exit = can_pernet_exit,
-	.async = true,
 };
 
 static __init int can_init(void)
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 26730d39e048..ac5e5e34fee3 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1717,7 +1717,6 @@ static void canbcm_pernet_exit(struct net *net)
 static struct pernet_operations canbcm_pernet_ops __read_mostly = {
 	.init = canbcm_pernet_init,
 	.exit = canbcm_pernet_exit,
-	.async = true,
 };
 
 static int __init bcm_module_init(void)
diff --git a/net/can/gw.c b/net/can/gw.c
index 8d71e199d5b3..faa3da88a127 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -1010,7 +1010,6 @@ static void __net_exit cangw_pernet_exit(struct net *net)
 static struct pernet_operations cangw_pernet_ops = {
 	.init = cangw_pernet_init,
 	.exit = cangw_pernet_exit,
-	.async = true,
 };
 
 static __init int cgw_module_init(void)
diff --git a/net/core/dev.c b/net/core/dev.c
index 97a96df4b6da..e13807b5c84d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8883,7 +8883,6 @@ static void __net_exit netdev_exit(struct net *net)
 static struct pernet_operations __net_initdata netdev_net_ops = {
 	.init = netdev_init,
 	.exit = netdev_exit,
-	.async = true,
 };
 
 static void __net_exit default_device_exit(struct net *net)
@@ -8984,7 +8983,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
 static struct pernet_operations __net_initdata default_device_ops = {
 	.exit = default_device_exit,
 	.exit_batch = default_device_exit_batch,
-	.async = true,
 };
 
 /*
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
index 5ace0705a3f9..0c048bdeb016 100644
--- a/net/core/fib_notifier.c
+++ b/net/core/fib_notifier.c
@@ -171,7 +171,6 @@ static void __net_exit fib_notifier_net_exit(struct net *net)
 static struct pernet_operations fib_notifier_net_ops = {
 	.init = fib_notifier_net_init,
 	.exit = fib_notifier_net_exit,
-	.async = true,
 };
 
 static int __init fib_notifier_init(void)
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index f6f04fc0f629..9d87ce868402 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -1130,7 +1130,6 @@ static void __net_exit fib_rules_net_exit(struct net *net)
 static struct pernet_operations fib_rules_net_ops = {
 	.init = fib_rules_net_init,
 	.exit = fib_rules_net_exit,
-	.async = true,
 };
 
 static int __init fib_rules_init(void)
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index dd6ae431d038..9737302907b1 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -349,7 +349,6 @@ static void __net_exit dev_proc_net_exit(struct net *net)
 static struct pernet_operations __net_initdata dev_proc_ops = {
 	.init = dev_proc_net_init,
 	.exit = dev_proc_net_exit,
-	.async = true,
 };
 
 static int dev_mc_seq_show(struct seq_file *seq, void *v)
@@ -406,7 +405,6 @@ static void __net_exit dev_mc_net_exit(struct net *net)
 static struct pernet_operations __net_initdata dev_mc_net_ops = {
 	.init = dev_mc_net_init,
 	.exit = dev_mc_net_exit,
-	.async = true,
 };
 
 int __init dev_proc_init(void)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 0f614523a13f..eef17ad29dea 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -338,7 +338,6 @@ static int __net_init net_defaults_init_net(struct net *net)
 
 static struct pernet_operations net_defaults_ops = {
 	.init = net_defaults_init_net,
-	.async = true,
 };
 
 static __init int net_defaults_init(void)
@@ -628,7 +627,6 @@ static __net_exit void net_ns_net_exit(struct net *net)
 static struct pernet_operations __net_initdata net_ns_ops = {
 	.init = net_ns_net_init,
 	.exit = net_ns_net_exit,
-	.async = true,
 };
 
 static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 545cf08cd558..7e4ede34cc52 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3852,7 +3852,6 @@ static struct pernet_operations pg_net_ops = {
 	.exit = pg_net_exit,
 	.id   = &pg_net_id,
 	.size = sizeof(struct pktgen_net),
-	.async = true,
 };
 
 static int __init pg_init(void)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 87079eaa871b..31438b63d4b4 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -4730,7 +4730,6 @@ static void __net_exit rtnetlink_net_exit(struct net *net)
 static struct pernet_operations rtnetlink_net_ops = {
 	.init = rtnetlink_net_init,
 	.exit = rtnetlink_net_exit,
-	.async = true,
 };
 
 void __init rtnetlink_init(void)
diff --git a/net/core/sock.c b/net/core/sock.c
index 8cee2920a47f..6444525f610c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3175,7 +3175,6 @@ static void __net_exit sock_inuse_exit_net(struct net *net)
 static struct pernet_operations net_inuse_ops = {
 	.init = sock_inuse_init_net,
 	.exit = sock_inuse_exit_net,
-	.async = true,
 };
 
 static __init int net_inuse_init(void)
@@ -3470,7 +3469,6 @@ static __net_exit void proto_exit_net(struct net *net)
 static __net_initdata struct pernet_operations proto_net_ops = {
 	.init = proto_init_net,
 	.exit = proto_exit_net,
-	.async = true,
 };
 
 static int __init proto_init(void)
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index a3392a8f9276..c37b5be7c5e4 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -324,7 +324,6 @@ static void __net_exit diag_net_exit(struct net *net)
 static struct pernet_operations diag_net_ops = {
 	.init = diag_net_init,
 	.exit = diag_net_exit,
-	.async = true,
 };
 
 static int __init sock_diag_init(void)
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 4f47f92459cc..b3b609f0eeb5 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -584,7 +584,6 @@ static __net_exit void sysctl_core_net_exit(struct net *net)
 static __net_initdata struct pernet_operations sysctl_core_ops = {
 	.init = sysctl_core_net_init,
 	.exit = sysctl_core_net_exit,
-	.async = true,
 };
 
 static __init int sysctl_core_init(void)
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 13ad28ab1e79..e65fcb45c3f6 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -1031,7 +1031,6 @@ static struct pernet_operations dccp_v4_ops = {
 	.init	= dccp_v4_init_net,
 	.exit	= dccp_v4_exit_net,
 	.exit_batch = dccp_v4_exit_batch,
-	.async	= true,
 };
 
 static int __init dccp_v4_init(void)
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 2f48c020f8c3..5df7857fc0f3 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1116,7 +1116,6 @@ static struct pernet_operations dccp_v6_ops = {
 	.init   = dccp_v6_init_net,
 	.exit   = dccp_v6_exit_net,
 	.exit_batch = dccp_v6_exit_batch,
-	.async	= true,
 };
 
 static int __init dccp_v6_init(void)
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index a9ccb1322f69..85bf86ad6b18 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -603,7 +603,6 @@ static void __net_exit lowpan_frags_exit_net(struct net *net)
 static struct pernet_operations lowpan_frags_ops = {
 	.init = lowpan_frags_init_net,
 	.exit = lowpan_frags_exit_net,
-	.async = true,
 };
 
 int __init lowpan_net_frag_init(void)
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index 9104943c15ba..cb7176cd4cd6 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -345,7 +345,6 @@ static void __net_exit cfg802154_pernet_exit(struct net *net)
 
 static struct pernet_operations cfg802154_pernet_ops = {
 	.exit = cfg802154_pernet_exit,
-	.async = true,
 };
 
 static int __init wpan_phy_class_init(void)
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e8c7fad8c329..f98e2f0db841 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1735,7 +1735,6 @@ static __net_exit void ipv4_mib_exit_net(struct net *net)
 static __net_initdata struct pernet_operations ipv4_mib_ops = {
 	.init = ipv4_mib_init_net,
 	.exit = ipv4_mib_exit_net,
-	.async = true,
 };
 
 static int __init init_ipv4_mibs(void)
@@ -1789,7 +1788,6 @@ static __net_exit void inet_exit_net(struct net *net)
 static __net_initdata struct pernet_operations af_inet_ops = {
 	.init = inet_init_net,
 	.exit = inet_exit_net,
-	.async = true,
 };
 
 static int __init init_inet_pernet_ops(void)
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 4c6ba0fb2630..be4c595edccb 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1447,7 +1447,6 @@ static void __net_exit arp_net_exit(struct net *net)
 static struct pernet_operations arp_net_ops = {
 	.init = arp_net_init,
 	.exit = arp_net_exit,
-	.async = true,
 };
 
 static int __init arp_proc_init(void)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 5ae0d1f097ca..40f001782c1b 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2469,7 +2469,6 @@ static __net_exit void devinet_exit_net(struct net *net)
 static __net_initdata struct pernet_operations devinet_ops = {
 	.init = devinet_init_net,
 	.exit = devinet_exit_net,
-	.async = true,
 };
 
 static struct rtnl_af_ops inet_af_ops __read_mostly = {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index ac71c3d496c0..f05afaf3235c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1362,7 +1362,6 @@ static void __net_exit fib_net_exit(struct net *net)
 static struct pernet_operations fib_net_ops = {
 	.init = fib_net_init,
 	.exit = fib_net_exit,
-	.async = true,
 };
 
 void __init ip_fib_init(void)
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index d3e1a9af478b..1540db65241a 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -1081,7 +1081,6 @@ static struct pernet_operations fou_net_ops = {
 	.exit = fou_exit_net,
 	.id   = &fou_net_id,
 	.size = sizeof(struct fou_net),
-	.async = true,
 };
 
 static int __init fou_init(void)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index cc56efa64d5c..1617604c9284 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1257,7 +1257,6 @@ fail:
 static struct pernet_operations __net_initdata icmp_sk_ops = {
        .init = icmp_sk_init,
        .exit = icmp_sk_exit,
-       .async = true,
 };
 
 int __init icmp_init(void)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index f17cd83ba164..b26a81a7de42 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -3028,7 +3028,6 @@ static void __net_exit igmp_net_exit(struct net *net)
 static struct pernet_operations igmp_net_ops = {
 	.init = igmp_net_init,
 	.exit = igmp_net_exit,
-	.async = true,
 };
 #endif
 
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 5e843ae5e468..bbf1b94942c0 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -885,7 +885,6 @@ static void __net_exit ipv4_frags_exit_net(struct net *net)
 static struct pernet_operations ip4_frags_ops = {
 	.init = ipv4_frags_init_net,
 	.exit = ipv4_frags_exit_net,
-	.async = true,
 };
 
 void __init ipfrag_init(void)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9ab1aa2f7660..a8772a978224 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1044,7 +1044,6 @@ static struct pernet_operations ipgre_net_ops = {
 	.exit_batch = ipgre_exit_batch_net,
 	.id   = &ipgre_net_id,
 	.size = sizeof(struct ip_tunnel_net),
-	.async = true,
 };
 
 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -1628,7 +1627,6 @@ static struct pernet_operations ipgre_tap_net_ops = {
 	.exit_batch = ipgre_tap_exit_batch_net,
 	.id   = &gre_tap_net_id,
 	.size = sizeof(struct ip_tunnel_net),
-	.async = true,
 };
 
 static int __net_init erspan_init_net(struct net *net)
@@ -1647,7 +1645,6 @@ static struct pernet_operations erspan_net_ops = {
 	.exit_batch = erspan_exit_batch_net,
 	.id   = &erspan_net_id,
 	.size = sizeof(struct ip_tunnel_net),
-	.async = true,
 };
 
 static int __init ipgre_init(void)
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index b10bf563afd9..51b1669334fe 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -454,7 +454,6 @@ static struct pernet_operations vti_net_ops = {
 	.exit_batch = vti_exit_batch_net,
 	.id   = &vti_net_id,
 	.size = sizeof(struct ip_tunnel_net),
-	.async = true,
 };
 
 static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 9c5a4d164f09..c891235b4966 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -669,7 +669,6 @@ static struct pernet_operations ipip_net_ops = {
 	.exit_batch = ipip_exit_batch_net,
 	.id   = &ipip_net_id,
 	.size = sizeof(struct ip_tunnel_net),
-	.async = true,
 };
 
 static int __init ipip_init(void)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e79211a8537c..2fb4de3f7f66 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -3009,7 +3009,6 @@ static void __net_exit ipmr_net_exit(struct net *net)
 static struct pernet_operations ipmr_net_ops = {
 	.init = ipmr_net_init,
 	.exit = ipmr_net_exit,
-	.async = true,
 };
 
 int __init ip_mr_init(void)
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index c36ffce3c812..e3e420f3ba7b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1635,7 +1635,6 @@ static void __net_exit arp_tables_net_exit(struct net *net)
 static struct pernet_operations arp_tables_net_ops = {
 	.init = arp_tables_net_init,
 	.exit = arp_tables_net_exit,
-	.async = true,
 };
 
 static int __init arp_tables_init(void)
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 49c2490193ae..8f8713b4388f 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -65,7 +65,6 @@ static void __net_exit arptable_filter_net_exit(struct net *net)
 
 static struct pernet_operations arptable_filter_net_ops = {
 	.exit = arptable_filter_net_exit,
-	.async = true,
 };
 
 static int __init arptable_filter_init(void)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d4f7584d2dbe..e38395a8dcf2 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1916,7 +1916,6 @@ static void __net_exit ip_tables_net_exit(struct net *net)
 static struct pernet_operations ip_tables_net_ops = {
 	.init = ip_tables_net_init,
 	.exit = ip_tables_net_exit,
-	.async = true,
 };
 
 static int __init ip_tables_init(void)
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 31b4cca588d0..2c8d313ae216 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -845,7 +845,6 @@ static struct pernet_operations clusterip_net_ops = {
 	.exit = clusterip_net_exit,
 	.id   = &clusterip_net_id,
 	.size = sizeof(struct clusterip_net),
-	.async = true,
 };
 
 static int __init clusterip_tg_init(void)
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index c1c136a93911..9ac92ea7b93c 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -87,7 +87,6 @@ static void __net_exit iptable_filter_net_exit(struct net *net)
 static struct pernet_operations iptable_filter_net_ops = {
 	.init = iptable_filter_net_init,
 	.exit = iptable_filter_net_exit,
-	.async = true,
 };
 
 static int __init iptable_filter_init(void)
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index f6074059531a..dea138ca8925 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -113,7 +113,6 @@ static void __net_exit iptable_mangle_net_exit(struct net *net)
 
 static struct pernet_operations iptable_mangle_net_ops = {
 	.exit = iptable_mangle_net_exit,
-	.async = true,
 };
 
 static int __init iptable_mangle_init(void)
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index b771af74be79..0f7255cc65ee 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -129,7 +129,6 @@ static void __net_exit iptable_nat_net_exit(struct net *net)
 
 static struct pernet_operations iptable_nat_net_ops = {
 	.exit	= iptable_nat_net_exit,
-	.async	= true,
 };
 
 static int __init iptable_nat_init(void)
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 963753e50842..960625aabf04 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -76,7 +76,6 @@ static void __net_exit iptable_raw_net_exit(struct net *net)
 
 static struct pernet_operations iptable_raw_net_ops = {
 	.exit = iptable_raw_net_exit,
-	.async = true,
 };
 
 static int __init iptable_raw_init(void)
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index c40d6b3d8b6a..e5379fe57b64 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -76,7 +76,6 @@ static void __net_exit iptable_security_net_exit(struct net *net)
 
 static struct pernet_operations iptable_security_net_ops = {
 	.exit = iptable_security_net_exit,
-	.async = true,
 };
 
 static int __init iptable_security_init(void)
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 6531f69db010..b50721d9d30e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -399,7 +399,6 @@ static struct pernet_operations ipv4_net_ops = {
 	.exit = ipv4_net_exit,
 	.id = &conntrack4_net_id,
 	.size = sizeof(struct conntrack4_net),
-	.async = true,
 };
 
 static int __init nf_conntrack_l3proto_ipv4_init(void)
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 57244b62a4fc..a0d3ad60a411 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -118,7 +118,6 @@ static void __net_exit defrag4_net_exit(struct net *net)
 
 static struct pernet_operations defrag4_net_ops = {
 	.exit = defrag4_net_exit,
-	.async = true,
 };
 
 static int __init nf_defrag_init(void)
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
index 162293469ac2..df5c2a2061a4 100644
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ b/net/ipv4/netfilter/nf_log_arp.c
@@ -122,7 +122,6 @@ static void __net_exit nf_log_arp_net_exit(struct net *net)
 static struct pernet_operations nf_log_arp_net_ops = {
 	.init = nf_log_arp_net_init,
 	.exit = nf_log_arp_net_exit,
-	.async = true,
 };
 
 static int __init nf_log_arp_init(void)
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
index 7a06de140f3c..4388de0e5380 100644
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -358,7 +358,6 @@ static void __net_exit nf_log_ipv4_net_exit(struct net *net)
 static struct pernet_operations nf_log_ipv4_net_ops = {
 	.init = nf_log_ipv4_net_init,
 	.exit = nf_log_ipv4_net_exit,
-	.async = true,
 };
 
 static int __init nf_log_ipv4_init(void)
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 1f24bc8273a0..05e47d777009 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1204,7 +1204,6 @@ static void __net_exit ping_v4_proc_exit_net(struct net *net)
 static struct pernet_operations ping_v4_net_ops = {
 	.init = ping_v4_proc_init_net,
 	.exit = ping_v4_proc_exit_net,
-	.async = true,
 };
 
 int __init ping_proc_init(void)
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 80de2e659dcb..adfb75340275 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -549,7 +549,6 @@ static __net_exit void ip_proc_exit_net(struct net *net)
 static __net_initdata struct pernet_operations ip_proc_ops = {
 	.init = ip_proc_init_net,
 	.exit = ip_proc_exit_net,
-	.async = true,
 };
 
 int __init ip_misc_proc_init(void)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 0ee2501a9027..1b4d3355624a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -1154,7 +1154,6 @@ static __net_exit void raw_exit_net(struct net *net)
 static __net_initdata struct pernet_operations raw_net_ops = {
 	.init = raw_init_net,
 	.exit = raw_exit_net,
-	.async = true,
 };
 
 int __init raw_proc_init(void)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ce9bd5380d21..8322e479f299 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -418,7 +418,6 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 	.init = ip_rt_do_proc_init,
 	.exit = ip_rt_do_proc_exit,
-	.async = true,
 };
 
 static int __init ip_rt_proc_init(void)
@@ -3017,7 +3016,6 @@ static __net_exit void sysctl_route_net_exit(struct net *net)
 static __net_initdata struct pernet_operations sysctl_route_ops = {
 	.init = sysctl_route_net_init,
 	.exit = sysctl_route_net_exit,
-	.async = true,
 };
 #endif
 
@@ -3031,7 +3029,6 @@ static __net_init int rt_genid_init(struct net *net)
 
 static __net_initdata struct pernet_operations rt_genid_ops = {
 	.init = rt_genid_init,
-	.async = true,
 };
 
 static int __net_init ipv4_inetpeer_init(struct net *net)
@@ -3057,7 +3054,6 @@ static void __net_exit ipv4_inetpeer_exit(struct net *net)
 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
 	.init	=	ipv4_inetpeer_init,
 	.exit	=	ipv4_inetpeer_exit,
-	.async	=	true,
 };
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 5b72d97693f8..4b195bac8ac0 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1219,7 +1219,6 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net)
 static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
 	.init = ipv4_sysctl_init_net,
 	.exit = ipv4_sysctl_exit_net,
-	.async = true,
 };
 
 static __init int sysctl_ipv4_init(void)
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fec8b1fd7b63..9639334ebb7c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2391,7 +2391,6 @@ static void __net_exit tcp4_proc_exit_net(struct net *net)
 static struct pernet_operations tcp4_net_ops = {
 	.init = tcp4_proc_init_net,
 	.exit = tcp4_proc_exit_net,
-	.async = true,
 };
 
 int __init tcp4_proc_init(void)
@@ -2578,7 +2577,6 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
        .init	   = tcp_sk_init,
        .exit	   = tcp_sk_exit,
        .exit_batch = tcp_sk_exit_batch,
-       .async	   = true,
 };
 
 void __init tcp_v4_init(void)
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index aa6fea9f3328..03b51cdcc731 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -1024,7 +1024,6 @@ static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_lis
 static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
 	.init		=	tcp_net_metrics_init,
 	.exit_batch	=	tcp_net_metrics_exit_batch,
-	.async		=	true,
 };
 
 void __init tcp_metrics_init(void)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index fb8f3a36bd14..f49e14cd3891 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2756,7 +2756,6 @@ static void __net_exit udp4_proc_exit_net(struct net *net)
 static struct pernet_operations udp4_net_ops = {
 	.init = udp4_proc_init_net,
 	.exit = udp4_proc_exit_net,
-	.async = true,
 };
 
 int __init udp4_proc_init(void)
@@ -2843,7 +2842,6 @@ static int __net_init udp_sysctl_init(struct net *net)
 
 static struct pernet_operations __net_initdata udp_sysctl_ops = {
 	.init	= udp_sysctl_init,
-	.async	= true,
 };
 
 void __init udp_init(void)
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 72f2c3806408..f96614e9b9a5 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -104,7 +104,6 @@ static void __net_exit udplite4_proc_exit_net(struct net *net)
 static struct pernet_operations udplite4_net_ops = {
 	.init = udplite4_proc_init_net,
 	.exit = udplite4_proc_exit_net,
-	.async = true,
 };
 
 static __init int udplite4_proc_init(void)
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 6c76a757fa4a..d73a6d6652f6 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -367,7 +367,6 @@ static void __net_exit xfrm4_net_exit(struct net *net)
 static struct pernet_operations __net_initdata xfrm4_net_ops = {
 	.init	= xfrm4_net_init,
 	.exit	= xfrm4_net_exit,
-	.async	= true,
 };
 
 static void __init xfrm4_policy_init(void)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 189eac80f4ef..78cef00c9596 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4282,7 +4282,6 @@ static void __net_exit if6_proc_net_exit(struct net *net)
 static struct pernet_operations if6_proc_net_ops = {
 	.init = if6_proc_net_init,
 	.exit = if6_proc_net_exit,
-	.async = true,
 };
 
 int __init if6_proc_init(void)
@@ -6592,7 +6591,6 @@ static void __net_exit addrconf_exit_net(struct net *net)
 static struct pernet_operations addrconf_ops = {
 	.init = addrconf_init_net,
 	.exit = addrconf_exit_net,
-	.async = true,
 };
 
 static struct rtnl_af_ops inet6_ops __read_mostly = {
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index ba2e63633370..1d6ced37ad71 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -344,7 +344,6 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net)
 static struct pernet_operations ipv6_addr_label_ops = {
 	.init = ip6addrlbl_net_init,
 	.exit = ip6addrlbl_net_exit,
-	.async = true,
 };
 
 int __init ipv6_addr_label_init(void)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index dbbe04018813..c1e292db04db 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -857,7 +857,6 @@ static void __net_exit inet6_net_exit(struct net *net)
 static struct pernet_operations inet6_net_ops = {
 	.init = inet6_net_init,
 	.exit = inet6_net_exit,
-	.async = true,
 };
 
 static const struct ipv6_stub ipv6_stub_impl = {
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 00ef9467f3c0..df113c7b5fc8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -397,7 +397,6 @@ static void __net_exit fib6_rules_net_exit(struct net *net)
 static struct pernet_operations fib6_rules_net_ops = {
 	.init = fib6_rules_net_init,
 	.exit = fib6_rules_net_exit,
-	.async = true,
 };
 
 int __init fib6_rules_init(void)
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 6f84668be6ea..d8c4b6374377 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -998,7 +998,6 @@ static void __net_exit icmpv6_sk_exit(struct net *net)
 static struct pernet_operations icmpv6_sk_ops = {
 	.init = icmpv6_sk_init,
 	.exit = icmpv6_sk_exit,
-	.async = true,
 };
 
 int __init icmpv6_init(void)
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index e438699f000f..44c39c5f0638 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -613,7 +613,6 @@ static struct pernet_operations ila_net_ops = {
 	.exit = ila_exit_net,
 	.id   = &ila_net_id,
 	.size = sizeof(struct ila_net),
-	.async = true,
 };
 
 static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 2f995e9e3050..908b8e5b615a 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -2161,7 +2161,6 @@ static void fib6_net_exit(struct net *net)
 static struct pernet_operations fib6_net_ops = {
 	.init = fib6_net_init,
 	.exit = fib6_net_exit,
-	.async = true,
 };
 
 int __init fib6_init(void)
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index f75b06ba8325..c05c4e82a7ca 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -873,7 +873,6 @@ static void __net_exit ip6_flowlabel_net_exit(struct net *net)
 static struct pernet_operations ip6_flowlabel_net_ops = {
 	.init = ip6_flowlabel_proc_init,
 	.exit = ip6_flowlabel_net_exit,
-	.async = true,
 };
 
 int ip6_flowlabel_init(void)
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 3a98c694da5f..22e86557aca4 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1528,7 +1528,6 @@ static struct pernet_operations ip6gre_net_ops = {
 	.exit_batch = ip6gre_exit_batch_net,
 	.id   = &ip6gre_net_id,
 	.size = sizeof(struct ip6gre_net),
-	.async = true,
 };
 
 static int ip6gre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 456fcf942f95..df4c29f7d59f 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -2260,7 +2260,6 @@ static struct pernet_operations ip6_tnl_net_ops = {
 	.exit_batch = ip6_tnl_exit_batch_net,
 	.id   = &ip6_tnl_net_id,
 	.size = sizeof(struct ip6_tnl_net),
-	.async = true,
 };
 
 /**
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index a482b854eeea..60b771f49fb5 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -1148,7 +1148,6 @@ static struct pernet_operations vti6_net_ops = {
 	.exit_batch = vti6_exit_batch_net,
 	.id   = &vti6_net_id,
 	.size = sizeof(struct vti6_net),
-	.async = true,
 };
 
 static struct xfrm6_protocol vti_esp6_protocol __read_mostly = {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 1c8fa29d155a..298fd8b6ed17 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1348,7 +1348,6 @@ static void __net_exit ip6mr_net_exit(struct net *net)
 static struct pernet_operations ip6mr_net_ops = {
 	.init = ip6mr_net_init,
 	.exit = ip6mr_net_exit,
-	.async = true,
 };
 
 int __init ip6_mr_init(void)
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 1e4c2b6ebd78..793159d77d8a 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -2997,7 +2997,6 @@ static void __net_exit igmp6_net_exit(struct net *net)
 static struct pernet_operations igmp6_net_ops = {
 	.init = igmp6_net_init,
 	.exit = igmp6_net_exit,
-	.async = true,
 };
 
 int __init igmp6_init(void)
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index d1d0b2fa7a07..9de4dfb126ba 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1883,7 +1883,6 @@ static void __net_exit ndisc_net_exit(struct net *net)
 static struct pernet_operations ndisc_net_ops = {
 	.init = ndisc_net_init,
 	.exit = ndisc_net_exit,
-	.async = true,
 };
 
 int __init ndisc_init(void)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 4de8ac1e5af4..62358b93bbac 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1928,7 +1928,6 @@ static void __net_exit ip6_tables_net_exit(struct net *net)
 static struct pernet_operations ip6_tables_net_ops = {
 	.init = ip6_tables_net_init,
 	.exit = ip6_tables_net_exit,
-	.async = true,
 };
 
 static int __init ip6_tables_init(void)
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 06561c84c0bc..1343077dde93 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -87,7 +87,6 @@ static void __net_exit ip6table_filter_net_exit(struct net *net)
 static struct pernet_operations ip6table_filter_net_ops = {
 	.init = ip6table_filter_net_init,
 	.exit = ip6table_filter_net_exit,
-	.async = true,
 };
 
 static int __init ip6table_filter_init(void)
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index a11e25936b45..b0524b18c4fb 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -107,7 +107,6 @@ static void __net_exit ip6table_mangle_net_exit(struct net *net)
 
 static struct pernet_operations ip6table_mangle_net_ops = {
 	.exit = ip6table_mangle_net_exit,
-	.async = true,
 };
 
 static int __init ip6table_mangle_init(void)
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 4475fd300bb6..47306e45a80a 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -131,7 +131,6 @@ static void __net_exit ip6table_nat_net_exit(struct net *net)
 
 static struct pernet_operations ip6table_nat_net_ops = {
 	.exit	= ip6table_nat_net_exit,
-	.async	= true,
 };
 
 static int __init ip6table_nat_init(void)
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index a88f3b1995b1..710fa0806c37 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -75,7 +75,6 @@ static void __net_exit ip6table_raw_net_exit(struct net *net)
 
 static struct pernet_operations ip6table_raw_net_ops = {
 	.exit = ip6table_raw_net_exit,
-	.async = true,
 };
 
 static int __init ip6table_raw_init(void)
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index 320048c008dc..cf26ccb04056 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -74,7 +74,6 @@ static void __net_exit ip6table_security_net_exit(struct net *net)
 
 static struct pernet_operations ip6table_security_net_ops = {
 	.exit = ip6table_security_net_exit,
-	.async = true,
 };
 
 static int __init ip6table_security_init(void)
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index ba54bb3bd1e4..663827ee3cf8 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -401,7 +401,6 @@ static struct pernet_operations ipv6_net_ops = {
 	.exit = ipv6_net_exit,
 	.id = &conntrack6_net_id,
 	.size = sizeof(struct conntrack6_net),
-	.async = true,
 };
 
 static int __init nf_conntrack_l3proto_ipv6_init(void)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 34136fe80ed5..b84ce3e6d728 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -646,7 +646,6 @@ static void nf_ct_net_exit(struct net *net)
 static struct pernet_operations nf_ct_net_ops = {
 	.init = nf_ct_net_init,
 	.exit = nf_ct_net_exit,
-	.async = true,
 };
 
 int nf_ct_frag6_init(void)
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 32f98bc06900..c87b48359e8f 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -103,7 +103,6 @@ static void __net_exit defrag6_net_exit(struct net *net)
 
 static struct pernet_operations defrag6_net_ops = {
 	.exit = defrag6_net_exit,
-	.async = true,
 };
 
 static int __init nf_defrag_init(void)
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
index 0220e584589c..b397a8fe88b9 100644
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -390,7 +390,6 @@ static void __net_exit nf_log_ipv6_net_exit(struct net *net)
 static struct pernet_operations nf_log_ipv6_net_ops = {
 	.init = nf_log_ipv6_net_init,
 	.exit = nf_log_ipv6_net_exit,
-	.async = true,
 };
 
 static int __init nf_log_ipv6_init(void)
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 318c6e914234..d12c55dad7d1 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -240,7 +240,6 @@ static void __net_init ping_v6_proc_exit_net(struct net *net)
 static struct pernet_operations ping_v6_net_ops = {
 	.init = ping_v6_proc_init_net,
 	.exit = ping_v6_proc_exit_net,
-	.async = true,
 };
 #endif
 
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index f9891fa672f3..6e57028d2e91 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -343,7 +343,6 @@ static void __net_exit ipv6_proc_exit_net(struct net *net)
 static struct pernet_operations ipv6_proc_ops = {
 	.init = ipv6_proc_init_net,
 	.exit = ipv6_proc_exit_net,
-	.async = true,
 };
 
 int __init ipv6_misc_proc_init(void)
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index b5e5de732494..5eb9b08947ed 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1332,7 +1332,6 @@ static void __net_exit raw6_exit_net(struct net *net)
 static struct pernet_operations raw6_net_ops = {
 	.init = raw6_init_net,
 	.exit = raw6_exit_net,
-	.async = true,
 };
 
 int __init raw6_proc_init(void)
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index b5da69c83123..afbc000ad4f2 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -733,7 +733,6 @@ static void __net_exit ipv6_frags_exit_net(struct net *net)
 static struct pernet_operations ip6_frags_ops = {
 	.init = ipv6_frags_init_net,
 	.exit = ipv6_frags_exit_net,
-	.async = true,
 };
 
 int __init ipv6_frag_init(void)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1d0eaa69874d..ba8d5df50ebe 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5083,7 +5083,6 @@ static void __net_exit ip6_route_net_exit_late(struct net *net)
 static struct pernet_operations ip6_route_net_ops = {
 	.init = ip6_route_net_init,
 	.exit = ip6_route_net_exit,
-	.async = true,
 };
 
 static int __net_init ipv6_inetpeer_init(struct net *net)
@@ -5109,13 +5108,11 @@ static void __net_exit ipv6_inetpeer_exit(struct net *net)
 static struct pernet_operations ipv6_inetpeer_ops = {
 	.init	=	ipv6_inetpeer_init,
 	.exit	=	ipv6_inetpeer_exit,
-	.async	=	true,
 };
 
 static struct pernet_operations ip6_route_net_late_ops = {
 	.init = ip6_route_net_init_late,
 	.exit = ip6_route_net_exit_late,
-	.async = true,
 };
 
 static struct notifier_block ip6_route_dev_notifier = {
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index c3f13c3bd8a9..7f5621d09571 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -395,7 +395,6 @@ static void __net_exit seg6_net_exit(struct net *net)
 static struct pernet_operations ip6_segments_ops = {
 	.init = seg6_net_init,
 	.exit = seg6_net_exit,
-	.async = true,
 };
 
 static const struct genl_ops seg6_genl_ops[] = {
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 8a4f8fddd812..1522bcfd253f 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1888,7 +1888,6 @@ static struct pernet_operations sit_net_ops = {
 	.exit_batch = sit_exit_batch_net,
 	.id   = &sit_net_id,
 	.size = sizeof(struct sit_net),
-	.async = true,
 };
 
 static void __exit sit_cleanup(void)
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 966c42af92f4..6fbdef630152 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -278,7 +278,6 @@ static void __net_exit ipv6_sysctl_net_exit(struct net *net)
 static struct pernet_operations ipv6_sysctl_net_ops = {
 	.init = ipv6_sysctl_net_init,
 	.exit = ipv6_sysctl_net_exit,
-	.async = true,
 };
 
 static struct ctl_table_header *ip6_header;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5425d7b100ee..883df0ad5bfe 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2007,7 +2007,6 @@ static struct pernet_operations tcpv6_net_ops = {
 	.init	    = tcpv6_net_init,
 	.exit	    = tcpv6_net_exit,
 	.exit_batch = tcpv6_net_exit_batch,
-	.async	    = true,
 };
 
 int __init tcpv6_init(void)
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index f3839780dc31..14ae32bb1f3d 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -123,7 +123,6 @@ static void __net_exit udplite6_proc_exit_net(struct net *net)
 static struct pernet_operations udplite6_net_ops = {
 	.init = udplite6_proc_init_net,
 	.exit = udplite6_proc_exit_net,
-	.async = true,
 };
 
 int __init udplite6_proc_init(void)
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index cbb270bd81b0..416fe67271a9 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -400,7 +400,6 @@ static void __net_exit xfrm6_net_exit(struct net *net)
 static struct pernet_operations xfrm6_net_ops = {
 	.init	= xfrm6_net_init,
 	.exit	= xfrm6_net_exit,
-	.async	= true,
 };
 
 int __init xfrm6_init(void)
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index a9673619e0e9..f85f0d7480ac 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -353,7 +353,6 @@ static struct pernet_operations xfrm6_tunnel_net_ops = {
 	.exit	= xfrm6_tunnel_net_exit,
 	.id	= &xfrm6_tunnel_net_id,
 	.size	= sizeof(struct xfrm6_tunnel_net),
-	.async	= true,
 };
 
 static int __init xfrm6_tunnel_init(void)
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 4c2e9907f254..1fac92543094 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -433,7 +433,6 @@ static void kcm_proc_exit_net(struct net *net)
 static struct pernet_operations kcm_net_ops = {
 	.init = kcm_proc_init_net,
 	.exit = kcm_proc_exit_net,
-	.async = true,
 };
 
 int __init kcm_proc_init(void)
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 516cfad71b85..dc76bc346829 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -2028,7 +2028,6 @@ static struct pernet_operations kcm_net_ops = {
 	.exit = kcm_exit_net,
 	.id   = &kcm_net_id,
 	.size = sizeof(struct kcm_net),
-	.async = true,
 };
 
 static int __init kcm_init(void)
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 3ac08ab26207..7e2e7188e7f4 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3863,7 +3863,6 @@ static struct pernet_operations pfkey_net_ops = {
 	.exit = pfkey_net_exit,
 	.id   = &pfkey_net_id,
 	.size = sizeof(struct netns_pfkey),
-	.async = true,
 };
 
 static void __exit ipsec_pfkey_exit(void)
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index b86868da50d4..14b67dfacc4b 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1789,7 +1789,6 @@ static struct pernet_operations l2tp_net_ops = {
 	.exit = l2tp_exit_net,
 	.id   = &l2tp_net_id,
 	.size = sizeof(struct l2tp_net),
-	.async = true,
 };
 
 static int __init l2tp_init(void)
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index f24504efe729..d6deca11da19 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1762,7 +1762,6 @@ static struct pernet_operations pppol2tp_net_ops = {
 	.init = pppol2tp_init_net,
 	.exit = pppol2tp_exit_net,
 	.id   = &pppol2tp_net_id,
-	.async = true,
 };
 
 /*****************************************************************************
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index d4a89a8be013..7a4de6d618b1 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -2488,7 +2488,6 @@ static void mpls_net_exit(struct net *net)
 static struct pernet_operations mpls_net_ops = {
 	.init = mpls_net_init,
 	.exit = mpls_net_exit,
-	.async = true,
 };
 
 static struct rtnl_af_ops mpls_af_ops __read_mostly = {
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index d72cc786c7b7..0f6b8172fb9a 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -629,7 +629,6 @@ static void __net_exit netfilter_net_exit(struct net *net)
 static struct pernet_operations netfilter_net_ops = {
 	.init = netfilter_net_init,
 	.exit = netfilter_net_exit,
-	.async = true,
 };
 
 int __init netfilter_init(void)
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 2523ebe2b3cc..bc4bd247bb7d 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -2095,7 +2095,6 @@ static struct pernet_operations ip_set_net_ops = {
 	.exit   = ip_set_net_exit,
 	.id	= &ip_set_net_id,
 	.size	= sizeof(struct ip_set_net),
-	.async	= true,
 };
 
 static int __init
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 6a6cb9db030b..5f6f73cf2174 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -2289,12 +2289,10 @@ static struct pernet_operations ipvs_core_ops = {
 	.exit = __ip_vs_cleanup,
 	.id   = &ip_vs_net_id,
 	.size = sizeof(struct netns_ipvs),
-	.async = true,
 };
 
 static struct pernet_operations ipvs_core_dev_ops = {
 	.exit = __ip_vs_dev_cleanup,
-	.async = true,
 };
 
 /*
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 8b25aab41928..58d5d05aec24 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -479,7 +479,6 @@ static void __ip_vs_ftp_exit(struct net *net)
 static struct pernet_operations ip_vs_ftp_ops = {
 	.init = __ip_vs_ftp_init,
 	.exit = __ip_vs_ftp_exit,
-	.async = true,
 };
 
 static int __init ip_vs_ftp_init(void)
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 6a340c94c4b8..d625179de485 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -604,7 +604,6 @@ static void __net_exit __ip_vs_lblc_exit(struct net *net) { }
 static struct pernet_operations ip_vs_lblc_ops = {
 	.init = __ip_vs_lblc_init,
 	.exit = __ip_vs_lblc_exit,
-	.async = true,
 };
 
 static int __init ip_vs_lblc_init(void)
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 0627881128da..84c57b62a588 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -789,7 +789,6 @@ static void __net_exit __ip_vs_lblcr_exit(struct net *net) { }
 static struct pernet_operations ip_vs_lblcr_ops = {
 	.init = __ip_vs_lblcr_init,
 	.exit = __ip_vs_lblcr_exit,
-	.async = true,
 };
 
 static int __init ip_vs_lblcr_init(void)
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 8884d302d33a..dd177ebee9aa 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -3417,7 +3417,6 @@ static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list)
 static struct pernet_operations ctnetlink_net_ops = {
 	.init		= ctnetlink_net_init,
 	.exit_batch	= ctnetlink_net_exit_batch,
-	.async		= true,
 };
 
 static int __init ctnetlink_init(void)
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 9bcd72fe91f9..d049ea5a3770 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -406,7 +406,6 @@ static struct pernet_operations proto_gre_net_ops = {
 	.exit = proto_gre_net_exit,
 	.id   = &proto_gre_net_id,
 	.size = sizeof(struct netns_proto_gre),
-	.async = true,
 };
 
 static int __init nf_ct_proto_gre_init(void)
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 98844c87d01e..037fec54c850 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -705,7 +705,6 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
 static struct pernet_operations nf_conntrack_net_ops = {
 	.init		= nf_conntrack_pernet_init,
 	.exit_batch	= nf_conntrack_pernet_exit,
-	.async		= true,
 };
 
 static int __init nf_conntrack_standalone_init(void)
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index a964e4d356cc..6d0357817cda 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -577,7 +577,6 @@ static void __net_exit nf_log_net_exit(struct net *net)
 static struct pernet_operations nf_log_net_ops = {
 	.init = nf_log_net_init,
 	.exit = nf_log_net_exit,
-	.async = true,
 };
 
 int __init netfilter_log_init(void)
diff --git a/net/netfilter/nf_log_netdev.c b/net/netfilter/nf_log_netdev.c
index 254c2c6bde48..350eb147754d 100644
--- a/net/netfilter/nf_log_netdev.c
+++ b/net/netfilter/nf_log_netdev.c
@@ -47,7 +47,6 @@ static void __net_exit nf_log_netdev_net_exit(struct net *net)
 static struct pernet_operations nf_log_netdev_net_ops = {
 	.init = nf_log_netdev_net_init,
 	.exit = nf_log_netdev_net_exit,
-	.async = true,
 };
 
 static int __init nf_log_netdev_init(void)
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 8f16fd27132d..6039b350abbe 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -398,7 +398,6 @@ static struct pernet_operations synproxy_net_ops = {
 	.exit		= synproxy_net_exit,
 	.id		= &synproxy_net_id,
 	.size		= sizeof(struct synproxy_net),
-	.async		= true,
 };
 
 static int __init synproxy_core_init(void)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index fd13d28e4ca7..c4acc7340eb1 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6597,7 +6597,6 @@ static void __net_exit nf_tables_exit_net(struct net *net)
 static struct pernet_operations nf_tables_net_ops = {
 	.init	= nf_tables_init_net,
 	.exit	= nf_tables_exit_net,
-	.async	= true,
 };
 
 static int __init nf_tables_module_init(void)
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 84fc4954862d..03ead8a9e90c 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -566,7 +566,6 @@ static void __net_exit nfnetlink_net_exit_batch(struct list_head *net_exit_list)
 static struct pernet_operations nfnetlink_net_ops = {
 	.init		= nfnetlink_net_init,
 	.exit_batch	= nfnetlink_net_exit_batch,
-	.async		= true,
 };
 
 static int __init nfnetlink_init(void)
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index 8d9f18bb8840..88d427f9f9e6 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -515,7 +515,6 @@ static void __net_exit nfnl_acct_net_exit(struct net *net)
 static struct pernet_operations nfnl_acct_ops = {
         .init   = nfnl_acct_net_init,
         .exit   = nfnl_acct_net_exit,
-	.async	= true,
 };
 
 static int __init nfnl_acct_init(void)
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 6819300f7fb7..95b04702a655 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -586,7 +586,6 @@ static void __net_exit cttimeout_net_exit(struct net *net)
 static struct pernet_operations cttimeout_ops = {
 	.init	= cttimeout_net_init,
 	.exit	= cttimeout_net_exit,
-	.async	= true,
 };
 
 static int __init cttimeout_init(void)
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index b21ef79849a1..7b46aa4c478d 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -1108,7 +1108,6 @@ static struct pernet_operations nfnl_log_net_ops = {
 	.exit	= nfnl_log_net_exit,
 	.id	= &nfnl_log_net_id,
 	.size	= sizeof(struct nfnl_log_net),
-	.async	= true,
 };
 
 static int __init nfnetlink_log_init(void)
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 9f572ed56208..0b839c38800f 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1525,7 +1525,6 @@ static struct pernet_operations nfnl_queue_net_ops = {
 	.exit_batch	= nfnl_queue_net_exit_batch,
 	.id		= &nfnl_queue_net_id,
 	.size		= sizeof(struct nfnl_queue_net),
-	.async		= true,
 };
 
 static int __init nfnetlink_queue_init(void)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 6de1f6a4cb80..4aa01c90e9d1 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1789,7 +1789,6 @@ static void __net_exit xt_net_exit(struct net *net)
 static struct pernet_operations xt_net_ops = {
 	.init = xt_net_init,
 	.exit = xt_net_exit,
-	.async = true,
 };
 
 static int __init xt_init(void)
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index ef65b7a9173e..3360f13dc208 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -1349,7 +1349,6 @@ static struct pernet_operations hashlimit_net_ops = {
 	.exit	= hashlimit_net_exit,
 	.id	= &hashlimit_net_id,
 	.size	= sizeof(struct hashlimit_net),
-	.async	= true,
 };
 
 static int __init hashlimit_mt_init(void)
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 434e35ce940b..9bbfc17ce3ec 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -687,7 +687,6 @@ static struct pernet_operations recent_net_ops = {
 	.exit	= recent_net_exit,
 	.id	= &recent_net_id,
 	.size	= sizeof(struct recent_net),
-	.async	= true,
 };
 
 static struct xt_match recent_mt_reg[] __read_mostly = {
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 5d10dcfe6411..f1b02d87e336 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -253,7 +253,6 @@ static struct pernet_operations netlink_tap_net_ops = {
 	.exit = netlink_tap_exit_net,
 	.id   = &netlink_tap_net_id,
 	.size = sizeof(struct netlink_tap_net),
-	.async = true,
 };
 
 static bool netlink_filter_tap(const struct sk_buff *skb)
@@ -2726,7 +2725,6 @@ static void __init netlink_add_usersock_entry(void)
 static struct pernet_operations __net_initdata netlink_net_ops = {
 	.init = netlink_net_init,
 	.exit = netlink_net_exit,
-	.async = true,
 };
 
 static inline u32 netlink_hash(const void *data, u32 len, u32 seed)
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index af51b8c0a2cb..b9ce82c9440f 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -1035,7 +1035,6 @@ static void __net_exit genl_pernet_exit(struct net *net)
 static struct pernet_operations genl_pernet_ops = {
 	.init = genl_pernet_init,
 	.exit = genl_pernet_exit,
-	.async = true,
 };
 
 static int __init genl_init(void)
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 100191df0371..ef38e5aecd28 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2384,7 +2384,6 @@ static struct pernet_operations ovs_net_ops = {
 	.exit = ovs_exit_net,
 	.id   = &ovs_net_id,
 	.size = sizeof(struct ovs_net),
-	.async = true,
 };
 
 static int __init dp_init(void)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 2c5a6fe5d749..616cb9c18f88 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4557,7 +4557,6 @@ static void __net_exit packet_net_exit(struct net *net)
 static struct pernet_operations packet_net_ops = {
 	.init = packet_net_init,
 	.exit = packet_net_exit,
-	.async = true,
 };
 
 
diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c
index 9454e8393793..77787512fc32 100644
--- a/net/phonet/pn_dev.c
+++ b/net/phonet/pn_dev.c
@@ -342,7 +342,6 @@ static struct pernet_operations phonet_net_ops = {
 	.exit = phonet_exit_net,
 	.id   = &phonet_net_id,
 	.size = sizeof(struct phonet_net),
-	.async = true,
 };
 
 /* Initialize Phonet devices list */
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 4f3a32c38bf5..351a28474667 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -530,7 +530,6 @@ static struct pernet_operations rds_tcp_net_ops = {
 	.exit = rds_tcp_exit_net,
 	.id = &rds_tcp_netid,
 	.size = sizeof(struct rds_tcp_net),
-	.async = true,
 };
 
 void *rds_tcp_listen_sock_def_readable(struct net *net)
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index 5fd939dabf41..f18c9248e0d4 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -106,5 +106,4 @@ struct pernet_operations rxrpc_net_ops = {
 	.exit	= rxrpc_exit_net,
 	.id	= &rxrpc_net_id,
 	.size	= sizeof(struct rxrpc_net),
-	.async	= true,
 };
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 7bd1b964f021..0d78b58e1898 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1533,7 +1533,6 @@ static struct pernet_operations tcf_action_net_ops = {
 	.exit = tcf_action_net_exit,
 	.id = &tcf_action_net_id,
 	.size = sizeof(struct tcf_action_net),
-	.async = true,
 };
 
 static int __init tc_action_init(void)
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 5cb9b268e8ff..9092531d45d8 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -413,7 +413,6 @@ static struct pernet_operations bpf_net_ops = {
 	.exit_batch = bpf_exit_net,
 	.id   = &bpf_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 static int __init bpf_init_module(void)
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 371e5e4ab3e2..e4b880fa51fe 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -222,7 +222,6 @@ static struct pernet_operations connmark_net_ops = {
 	.exit_batch = connmark_exit_net,
 	.id   = &connmark_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 static int __init connmark_init_module(void)
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index a527e287c086..7e28b2ce1437 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -678,7 +678,6 @@ static struct pernet_operations csum_net_ops = {
 	.exit_batch = csum_exit_net,
 	.id   = &csum_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 MODULE_DESCRIPTION("Checksum updating actions");
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 88fbb8403565..4dc4f153cad8 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -261,7 +261,6 @@ static struct pernet_operations gact_net_ops = {
 	.exit_batch = gact_exit_net,
 	.id   = &gact_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 555b1caeff72..a5994cf0512b 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -870,7 +870,6 @@ static struct pernet_operations ife_net_ops = {
 	.exit_batch = ife_exit_net,
 	.id   = &ife_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 static int __init ife_init_module(void)
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index b5e8565b89c7..14c312d7908f 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -352,7 +352,6 @@ static struct pernet_operations ipt_net_ops = {
 	.exit_batch = ipt_exit_net,
 	.id   = &ipt_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
@@ -403,7 +402,6 @@ static struct pernet_operations xt_net_ops = {
 	.exit_batch = xt_exit_net,
 	.id   = &xt_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-13)");
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 64c86579c3d9..fd34015331ab 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -353,7 +353,6 @@ static struct pernet_operations mirred_net_ops = {
 	.exit_batch = mirred_exit_net,
 	.id   = &mirred_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002)");
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index b1bc757f6491..4b5848b6c252 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -323,7 +323,6 @@ static struct pernet_operations nat_net_ops = {
 	.exit_batch = nat_exit_net,
 	.id   = &nat_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 MODULE_DESCRIPTION("Stateless NAT actions");
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index f392ccaaa0d8..8a925c72db5f 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -465,7 +465,6 @@ static struct pernet_operations pedit_net_ops = {
 	.exit_batch = pedit_exit_net,
 	.id   = &pedit_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 7081ec75e696..4e72bc2a0dfb 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -347,7 +347,6 @@ static struct pernet_operations police_net_ops = {
 	.exit_batch = police_exit_net,
 	.id   = &police_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 static int __init police_init_module(void)
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 3a89f98f17e6..5db358497c9e 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -249,7 +249,6 @@ static struct pernet_operations sample_net_ops = {
 	.exit_batch = sample_exit_net,
 	.id   = &sample_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 static int __init sample_init_module(void)
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index e84768ae610a..9618b4a83cee 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -216,7 +216,6 @@ static struct pernet_operations simp_net_ops = {
 	.exit_batch = simp_exit_net,
 	.id   = &simp_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim(2005)");
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 7971510fe61b..ddf69fc01bdf 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -253,7 +253,6 @@ static struct pernet_operations skbedit_net_ops = {
 	.exit_batch = skbedit_exit_net,
 	.id   = &skbedit_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index 142a996ac776..bbcbdce732cc 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -279,7 +279,6 @@ static struct pernet_operations skbmod_net_ops = {
 	.exit_batch = skbmod_exit_net,
 	.id   = &skbmod_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 MODULE_AUTHOR("Jamal Hadi Salim, <jhs@mojatatu.com>");
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index a1c8dd406a04..626dac81a48a 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -339,7 +339,6 @@ static struct pernet_operations tunnel_key_net_ops = {
 	.exit_batch = tunnel_key_exit_net,
 	.id   = &tunnel_key_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 static int __init tunnel_key_init_module(void)
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 41a66effeb5f..853604685965 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -314,7 +314,6 @@ static struct pernet_operations vlan_net_ops = {
 	.exit_batch = vlan_exit_net,
 	.id   = &vlan_net_id,
 	.size = sizeof(struct tc_action_net),
-	.async = true,
 };
 
 static int __init vlan_init_module(void)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index ec5fe8ec0c3e..b66754f52a9f 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -1619,7 +1619,6 @@ static struct pernet_operations tcf_net_ops = {
 	.exit = tcf_net_exit,
 	.id   = &tcf_net_id,
 	.size = sizeof(struct tcf_net),
-	.async = true,
 };
 
 static int __init tc_filter_init(void)
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 68f9d942bed4..106dae7e4818 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -2133,7 +2133,6 @@ static void __net_exit psched_net_exit(struct net *net)
 static struct pernet_operations psched_net_ops = {
 	.init = psched_net_init,
 	.exit = psched_net_exit,
-	.async = true,
 };
 
 static int __init pktsched_init(void)
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 493b817f6a2a..84a09f599131 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1283,7 +1283,6 @@ static void __net_exit sctp_defaults_exit(struct net *net)
 static struct pernet_operations sctp_defaults_ops = {
 	.init = sctp_defaults_init,
 	.exit = sctp_defaults_exit,
-	.async = true,
 };
 
 static int __net_init sctp_ctrlsock_init(struct net *net)
@@ -1307,7 +1306,6 @@ static void __net_init sctp_ctrlsock_exit(struct net *net)
 static struct pernet_operations sctp_ctrlsock_ops = {
 	.init = sctp_ctrlsock_init,
 	.exit = sctp_ctrlsock_exit,
-	.async = true,
 };
 
 /* Initialize the universe into something sensible.  */
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 44f939cb6bc8..9463af4b32e8 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -2063,7 +2063,6 @@ static __net_exit void rpcsec_gss_exit_net(struct net *net)
 static struct pernet_operations rpcsec_gss_net_ops = {
 	.init = rpcsec_gss_init_net,
 	.exit = rpcsec_gss_exit_net,
-	.async = true,
 };
 
 /*
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 68287e921847..56f9eff74150 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -79,7 +79,6 @@ static struct pernet_operations sunrpc_net_ops = {
 	.exit = sunrpc_exit_net,
 	.id = &sunrpc_net_id,
 	.size = sizeof(struct sunrpc_net),
-	.async = true,
 };
 
 static int __init
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index f424539829b7..9aed6fe1bf1a 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -89,7 +89,6 @@ static void __net_exit sysctl_net_exit(struct net *net)
 static struct pernet_operations sysctl_pernet_ops = {
 	.init = sysctl_net_init,
 	.exit = sysctl_net_exit,
-	.async = true,
 };
 
 static struct ctl_table_header *net_header;
diff --git a/net/tipc/core.c b/net/tipc/core.c
index 52dfc51ac4d5..5b38f5164281 100644
--- a/net/tipc/core.c
+++ b/net/tipc/core.c
@@ -109,7 +109,6 @@ static struct pernet_operations tipc_net_ops = {
 	.exit = tipc_exit_net,
 	.id   = &tipc_net_id,
 	.size = sizeof(struct tipc_net),
-	.async = true,
 };
 
 static int __init tipc_init(void)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index bc2970a8e7f3..aded82da1aea 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2913,7 +2913,6 @@ static void __net_exit unix_net_exit(struct net *net)
 static struct pernet_operations unix_net_ops = {
 	.init = unix_net_init,
 	.exit = unix_net_exit,
-	.async = true,
 };
 
 static int __init af_unix_init(void)
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 670aa229168a..a6f3cac8c640 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1340,7 +1340,6 @@ static void __net_exit cfg80211_pernet_exit(struct net *net)
 
 static struct pernet_operations cfg80211_pernet_ops = {
 	.exit = cfg80211_pernet_exit,
-	.async = true,
 };
 
 static int __init cfg80211_init(void)
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index bc7064486b15..9efbfc753347 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -390,7 +390,6 @@ static void __net_exit wext_pernet_exit(struct net *net)
 static struct pernet_operations wext_pernet_ops = {
 	.init = wext_pernet_init,
 	.exit = wext_pernet_exit,
-	.async = true,
 };
 
 static int __init wireless_nlevent_init(void)
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index cb3bb9ae4407..625b3fca5704 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2985,7 +2985,6 @@ static void __net_exit xfrm_net_exit(struct net *net)
 static struct pernet_operations __net_initdata xfrm_net_ops = {
 	.init = xfrm_net_init,
 	.exit = xfrm_net_exit,
-	.async = true,
 };
 
 void __init xfrm_init(void)
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index e92b8c019c88..080035f056d9 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -3253,7 +3253,6 @@ static void __net_exit xfrm_user_net_exit(struct list_head *net_exit_list)
 static struct pernet_operations xfrm_user_net_ops = {
 	.init	    = xfrm_user_net_init,
 	.exit_batch = xfrm_user_net_exit,
-	.async	    = true,
 };
 
 static int __init xfrm_user_init(void)
-- 
cgit v1.2.3


From 4420bf21fb6c0306e36ad58ade1e741fba57ce65 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 27 Mar 2018 18:02:23 +0300
Subject: net: Rename net_sem to pernet_ops_rwsem

net_sem is some undefined area name, so it will be better
to make the area more defined.

Rename it to pernet_ops_rwsem for better readability and
better intelligibility.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 40 ++++++++++++++++++++--------------------
 net/core/rtnetlink.c     |  4 ++--
 2 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index eef17ad29dea..9e8ee4640451 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -41,10 +41,10 @@ EXPORT_SYMBOL(init_net);
 
 static bool init_net_initialized;
 /*
- * net_sem: protects: pernet_list, net_generic_ids,
+ * pernet_ops_rwsem: protects: pernet_list, net_generic_ids,
  * init_net_initialized and first_device pointer.
  */
-DECLARE_RWSEM(net_sem);
+DECLARE_RWSEM(pernet_ops_rwsem);
 
 #define MIN_PERNET_OPS_ID	\
 	((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
@@ -72,7 +72,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
 	BUG_ON(id < MIN_PERNET_OPS_ID);
 
 	old_ng = rcu_dereference_protected(net->gen,
-					   lockdep_is_held(&net_sem));
+					   lockdep_is_held(&pernet_ops_rwsem));
 	if (old_ng->s.len > id) {
 		old_ng->ptr[id] = data;
 		return 0;
@@ -289,7 +289,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
  */
 static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 {
-	/* Must be called with net_sem held */
+	/* Must be called with pernet_ops_rwsem held */
 	const struct pernet_operations *ops, *saved_ops;
 	int error = 0;
 	LIST_HEAD(net_exit_list);
@@ -422,13 +422,13 @@ struct net *copy_net_ns(unsigned long flags,
 	net->ucounts = ucounts;
 	get_user_ns(user_ns);
 
-	rv = down_read_killable(&net_sem);
+	rv = down_read_killable(&pernet_ops_rwsem);
 	if (rv < 0)
 		goto put_userns;
 
 	rv = setup_net(net, user_ns);
 
-	up_read(&net_sem);
+	up_read(&pernet_ops_rwsem);
 
 	if (rv < 0) {
 put_userns:
@@ -480,7 +480,7 @@ static void cleanup_net(struct work_struct *work)
 	/* Atomically snapshot the list of namespaces to cleanup */
 	net_kill_list = llist_del_all(&cleanup_list);
 
-	down_read(&net_sem);
+	down_read(&pernet_ops_rwsem);
 
 	/* Don't let anyone else find us. */
 	rtnl_lock();
@@ -519,7 +519,7 @@ static void cleanup_net(struct work_struct *work)
 	list_for_each_entry_reverse(ops, &pernet_list, list)
 		ops_free_list(ops, &net_exit_list);
 
-	up_read(&net_sem);
+	up_read(&pernet_ops_rwsem);
 
 	/* Ensure there are no outstanding rcu callbacks using this
 	 * network namespace.
@@ -546,8 +546,8 @@ static void cleanup_net(struct work_struct *work)
  */
 void net_ns_barrier(void)
 {
-	down_write(&net_sem);
-	up_write(&net_sem);
+	down_write(&pernet_ops_rwsem);
+	up_write(&pernet_ops_rwsem);
 }
 EXPORT_SYMBOL(net_ns_barrier);
 
@@ -869,12 +869,12 @@ static int __init net_ns_init(void)
 
 	rcu_assign_pointer(init_net.gen, ng);
 
-	down_write(&net_sem);
+	down_write(&pernet_ops_rwsem);
 	if (setup_net(&init_net, &init_user_ns))
 		panic("Could not setup the initial network namespace");
 
 	init_net_initialized = true;
-	up_write(&net_sem);
+	up_write(&pernet_ops_rwsem);
 
 	register_pernet_subsys(&net_ns_ops);
 
@@ -1013,9 +1013,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
 int register_pernet_subsys(struct pernet_operations *ops)
 {
 	int error;
-	down_write(&net_sem);
+	down_write(&pernet_ops_rwsem);
 	error =  register_pernet_operations(first_device, ops);
-	up_write(&net_sem);
+	up_write(&pernet_ops_rwsem);
 	return error;
 }
 EXPORT_SYMBOL_GPL(register_pernet_subsys);
@@ -1031,9 +1031,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys);
  */
 void unregister_pernet_subsys(struct pernet_operations *ops)
 {
-	down_write(&net_sem);
+	down_write(&pernet_ops_rwsem);
 	unregister_pernet_operations(ops);
-	up_write(&net_sem);
+	up_write(&pernet_ops_rwsem);
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
 
@@ -1059,11 +1059,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
 int register_pernet_device(struct pernet_operations *ops)
 {
 	int error;
-	down_write(&net_sem);
+	down_write(&pernet_ops_rwsem);
 	error = register_pernet_operations(&pernet_list, ops);
 	if (!error && (first_device == &pernet_list))
 		first_device = &ops->list;
-	up_write(&net_sem);
+	up_write(&pernet_ops_rwsem);
 	return error;
 }
 EXPORT_SYMBOL_GPL(register_pernet_device);
@@ -1079,11 +1079,11 @@ EXPORT_SYMBOL_GPL(register_pernet_device);
  */
 void unregister_pernet_device(struct pernet_operations *ops)
 {
-	down_write(&net_sem);
+	down_write(&pernet_ops_rwsem);
 	if (&ops->list == first_device)
 		first_device = first_device->next;
 	unregister_pernet_operations(ops);
-	up_write(&net_sem);
+	up_write(&pernet_ops_rwsem);
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_device);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 31438b63d4b4..73011a60434c 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -460,11 +460,11 @@ static void rtnl_lock_unregistering_all(void)
 void rtnl_link_unregister(struct rtnl_link_ops *ops)
 {
 	/* Close the race with cleanup_net() */
-	down_write(&net_sem);
+	down_write(&pernet_ops_rwsem);
 	rtnl_lock_unregistering_all();
 	__rtnl_link_unregister(ops);
 	rtnl_unlock();
-	up_write(&net_sem);
+	up_write(&pernet_ops_rwsem);
 }
 EXPORT_SYMBOL_GPL(rtnl_link_unregister);
 
-- 
cgit v1.2.3


From 8518e9bb98b602eca0717d5aaad63ccbe56539d2 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 27 Mar 2018 18:02:32 +0300
Subject: net: Add more comments

This adds comments to different places to improve
readability.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 2 ++
 net/core/rtnetlink.c     | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 9e8ee4640451..b5796d17a302 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -43,6 +43,8 @@ static bool init_net_initialized;
 /*
  * pernet_ops_rwsem: protects: pernet_list, net_generic_ids,
  * init_net_initialized and first_device pointer.
+ * This is internal net namespace object. Please, don't use it
+ * outside.
  */
 DECLARE_RWSEM(pernet_ops_rwsem);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 73011a60434c..2d3949789cef 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -459,7 +459,7 @@ static void rtnl_lock_unregistering_all(void)
  */
 void rtnl_link_unregister(struct rtnl_link_ops *ops)
 {
-	/* Close the race with cleanup_net() */
+	/* Close the race with setup_net() and cleanup_net() */
 	down_write(&pernet_ops_rwsem);
 	rtnl_lock_unregistering_all();
 	__rtnl_link_unregister(ops);
-- 
cgit v1.2.3


From 827efed6a66ef8a1c071400b5952fee4a5ffedf9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 27 Mar 2018 23:02:47 +0100
Subject: rxrpc: Trace resend

Add a tracepoint to trace packet resend events and to dump the Tx
annotation buffer for added illumination.

Signed-off-by: David Howells <dhowells@rdhat.com>
---
 net/rxrpc/call_event.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index ad2ab1103189..6a62e42e1d8d 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -195,6 +195,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
 	 * the packets in the Tx buffer we're going to resend and what the new
 	 * resend timeout will be.
 	 */
+	trace_rxrpc_resend(call, (cursor + 1) & RXRPC_RXTX_BUFF_MASK);
 	oldest = now;
 	for (seq = cursor + 1; before_eq(seq, top); seq++) {
 		ix = seq & RXRPC_RXTX_BUFF_MASK;
-- 
cgit v1.2.3


From a25e21f0bcd25673b91b97b9805db33350feec0f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 27 Mar 2018 23:03:00 +0100
Subject: rxrpc, afs: Use debug_ids rather than pointers in traces

In rxrpc and afs, use the debug_ids that are monotonically allocated to
various objects as they're allocated rather than pointers as kernel
pointers are now hashed making them less useful.  Further, the debug ids
aren't reused anywhere nearly as quickly.

In addition, allow kernel services that use rxrpc, such as afs, to take
numbers from the rxrpc counter, assign them to their own call struct and
pass them in to rxrpc for both client and service calls so that the trace
lines for each will have the same ID tag.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/af_rxrpc.c    |  7 +++++--
 net/rxrpc/ar-internal.h |  8 ++++----
 net/rxrpc/call_accept.c | 18 +++++++++++-------
 net/rxrpc/call_object.c | 15 +++++++++------
 net/rxrpc/conn_event.c  |  3 ++-
 net/rxrpc/input.c       |  6 +++---
 net/rxrpc/sendmsg.c     |  3 ++-
 7 files changed, 36 insertions(+), 24 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 9e1c2c6b6a67..ec5ec68be1aa 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -40,6 +40,7 @@ static const struct proto_ops rxrpc_rpc_ops;
 
 /* current debugging ID */
 atomic_t rxrpc_debug_id;
+EXPORT_SYMBOL(rxrpc_debug_id);
 
 /* count of skbs currently in use */
 atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs;
@@ -267,6 +268,7 @@ static int rxrpc_listen(struct socket *sock, int backlog)
  * @gfp: The allocation constraints
  * @notify_rx: Where to send notifications instead of socket queue
  * @upgrade: Request service upgrade for call
+ * @debug_id: The debug ID for tracing to be assigned to the call
  *
  * Allow a kernel service to begin a call on the nominated socket.  This just
  * sets up all the internal tracking structures and allocates connection and
@@ -282,7 +284,8 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 					   s64 tx_total_len,
 					   gfp_t gfp,
 					   rxrpc_notify_rx_t notify_rx,
-					   bool upgrade)
+					   bool upgrade,
+					   unsigned int debug_id)
 {
 	struct rxrpc_conn_parameters cp;
 	struct rxrpc_call_params p;
@@ -314,7 +317,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 	cp.exclusive		= false;
 	cp.upgrade		= upgrade;
 	cp.service_id		= srx->srx_service;
-	call = rxrpc_new_client_call(rx, &cp, srx, &p, gfp);
+	call = rxrpc_new_client_call(rx, &cp, srx, &p, gfp, debug_id);
 	/* The socket has been unlocked. */
 	if (!IS_ERR(call)) {
 		call->notify_rx = notify_rx;
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 416688381eb7..9c9817ddafc5 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -691,7 +691,6 @@ struct rxrpc_send_params {
  * af_rxrpc.c
  */
 extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs;
-extern atomic_t rxrpc_debug_id;
 extern struct workqueue_struct *rxrpc_workqueue;
 
 /*
@@ -732,11 +731,12 @@ extern unsigned int rxrpc_max_call_lifetime;
 extern struct kmem_cache *rxrpc_call_jar;
 
 struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long);
-struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t);
+struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int);
 struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
 					 struct rxrpc_conn_parameters *,
 					 struct sockaddr_rxrpc *,
-					 struct rxrpc_call_params *, gfp_t);
+					 struct rxrpc_call_params *, gfp_t,
+					 unsigned int);
 int rxrpc_retry_client_call(struct rxrpc_sock *,
 			    struct rxrpc_call *,
 			    struct rxrpc_conn_parameters *,
@@ -822,7 +822,7 @@ static inline bool __rxrpc_abort_call(const char *why, struct rxrpc_call *call,
 				      rxrpc_seq_t seq,
 				      u32 abort_code, int error)
 {
-	trace_rxrpc_abort(why, call->cid, call->call_id, seq,
+	trace_rxrpc_abort(call->debug_id, why, call->cid, call->call_id, seq,
 			  abort_code, error);
 	return __rxrpc_set_call_completion(call, RXRPC_CALL_LOCALLY_ABORTED,
 					   abort_code, error);
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 3028298ca561..92ebd1d7e0bb 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -34,7 +34,8 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
 				      struct rxrpc_backlog *b,
 				      rxrpc_notify_rx_t notify_rx,
 				      rxrpc_user_attach_call_t user_attach_call,
-				      unsigned long user_call_ID, gfp_t gfp)
+				      unsigned long user_call_ID, gfp_t gfp,
+				      unsigned int debug_id)
 {
 	const void *here = __builtin_return_address(0);
 	struct rxrpc_call *call;
@@ -94,7 +95,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
 	/* Now it gets complicated, because calls get registered with the
 	 * socket here, particularly if a user ID is preassigned by the user.
 	 */
-	call = rxrpc_alloc_call(rx, gfp);
+	call = rxrpc_alloc_call(rx, gfp, debug_id);
 	if (!call)
 		return -ENOMEM;
 	call->flags |= (1 << RXRPC_CALL_IS_SERVICE);
@@ -174,7 +175,8 @@ int rxrpc_service_prealloc(struct rxrpc_sock *rx, gfp_t gfp)
 	if (rx->discard_new_call)
 		return 0;
 
-	while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp) == 0)
+	while (rxrpc_service_prealloc_one(rx, b, NULL, NULL, 0, gfp,
+					  atomic_inc_return(&rxrpc_debug_id)) == 0)
 		;
 
 	return 0;
@@ -347,7 +349,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local,
 		   service_id == rx->second_service))
 		goto found_service;
 
-	trace_rxrpc_abort("INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+	trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
 			  RX_INVALID_OPERATION, EOPNOTSUPP);
 	skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
 	skb->priority = RX_INVALID_OPERATION;
@@ -358,7 +360,7 @@ found_service:
 	spin_lock(&rx->incoming_lock);
 	if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED ||
 	    rx->sk.sk_state == RXRPC_CLOSE) {
-		trace_rxrpc_abort("CLS", sp->hdr.cid, sp->hdr.callNumber,
+		trace_rxrpc_abort(0, "CLS", sp->hdr.cid, sp->hdr.callNumber,
 				  sp->hdr.seq, RX_INVALID_OPERATION, ESHUTDOWN);
 		skb->mark = RXRPC_SKB_MARK_LOCAL_ABORT;
 		skb->priority = RX_INVALID_OPERATION;
@@ -635,6 +637,7 @@ out_discard:
  * @user_attach_call: Func to attach call to user_call_ID
  * @user_call_ID: The tag to attach to the preallocated call
  * @gfp: The allocation conditions.
+ * @debug_id: The tracing debug ID.
  *
  * Charge up the socket with preallocated calls, each with a user ID.  A
  * function should be provided to effect the attachment from the user's side.
@@ -645,7 +648,8 @@ out_discard:
 int rxrpc_kernel_charge_accept(struct socket *sock,
 			       rxrpc_notify_rx_t notify_rx,
 			       rxrpc_user_attach_call_t user_attach_call,
-			       unsigned long user_call_ID, gfp_t gfp)
+			       unsigned long user_call_ID, gfp_t gfp,
+			       unsigned int debug_id)
 {
 	struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
 	struct rxrpc_backlog *b = rx->backlog;
@@ -655,6 +659,6 @@ int rxrpc_kernel_charge_accept(struct socket *sock,
 
 	return rxrpc_service_prealloc_one(rx, b, notify_rx,
 					  user_attach_call, user_call_ID,
-					  gfp);
+					  gfp, debug_id);
 }
 EXPORT_SYMBOL(rxrpc_kernel_charge_accept);
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 0b2db38dd32d..147657dfe757 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -99,7 +99,8 @@ found_extant_call:
 /*
  * allocate a new call
  */
-struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp)
+struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
+				    unsigned int debug_id)
 {
 	struct rxrpc_call *call;
 
@@ -138,7 +139,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp)
 	spin_lock_init(&call->notify_lock);
 	rwlock_init(&call->state_lock);
 	atomic_set(&call->usage, 1);
-	call->debug_id = atomic_inc_return(&rxrpc_debug_id);
+	call->debug_id = debug_id;
 	call->tx_total_len = -1;
 	call->next_rx_timo = 20 * HZ;
 	call->next_req_timo = 1 * HZ;
@@ -166,14 +167,15 @@ nomem:
  */
 static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
 						  struct sockaddr_rxrpc *srx,
-						  gfp_t gfp)
+						  gfp_t gfp,
+						  unsigned int debug_id)
 {
 	struct rxrpc_call *call;
 	ktime_t now;
 
 	_enter("");
 
-	call = rxrpc_alloc_call(rx, gfp);
+	call = rxrpc_alloc_call(rx, gfp, debug_id);
 	if (!call)
 		return ERR_PTR(-ENOMEM);
 	call->state = RXRPC_CALL_CLIENT_AWAIT_CONN;
@@ -214,7 +216,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
 					 struct rxrpc_conn_parameters *cp,
 					 struct sockaddr_rxrpc *srx,
 					 struct rxrpc_call_params *p,
-					 gfp_t gfp)
+					 gfp_t gfp,
+					 unsigned int debug_id)
 	__releases(&rx->sk.sk_lock.slock)
 {
 	struct rxrpc_call *call, *xcall;
@@ -225,7 +228,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
 
 	_enter("%p,%lx", rx, p->user_call_ID);
 
-	call = rxrpc_alloc_client_call(rx, srx, gfp);
+	call = rxrpc_alloc_client_call(rx, srx, gfp, debug_id);
 	if (IS_ERR(call)) {
 		release_sock(&rx->sk);
 		_leave(" = %ld", PTR_ERR(call));
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index b1dfae107431..d2ec3fd593e8 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -160,7 +160,8 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn,
 			lockdep_is_held(&conn->channel_lock));
 		if (call) {
 			if (compl == RXRPC_CALL_LOCALLY_ABORTED)
-				trace_rxrpc_abort("CON", call->cid,
+				trace_rxrpc_abort(call->debug_id,
+						  "CON", call->cid,
 						  call->call_id, 0,
 						  abort_code, error);
 			if (rxrpc_set_call_completion(call, compl,
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 6fc61400337f..2a868fdab0ae 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1307,21 +1307,21 @@ out_unlock:
 
 wrong_security:
 	rcu_read_unlock();
-	trace_rxrpc_abort("SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+	trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
 			  RXKADINCONSISTENCY, EBADMSG);
 	skb->priority = RXKADINCONSISTENCY;
 	goto post_abort;
 
 reupgrade:
 	rcu_read_unlock();
-	trace_rxrpc_abort("UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+	trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
 			  RX_PROTOCOL_ERROR, EBADMSG);
 	goto protocol_error;
 
 bad_message_unlock:
 	rcu_read_unlock();
 bad_message:
-	trace_rxrpc_abort("BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
+	trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq,
 			  RX_PROTOCOL_ERROR, EBADMSG);
 protocol_error:
 	skb->priority = RX_PROTOCOL_ERROR;
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 09f2a3e05221..8503f279b467 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -579,7 +579,8 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
 	cp.exclusive		= rx->exclusive | p->exclusive;
 	cp.upgrade		= p->upgrade;
 	cp.service_id		= srx->srx_service;
-	call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL);
+	call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL,
+				     atomic_inc_return(&rxrpc_debug_id));
 	/* The socket is now unlocked */
 
 	_leave(" = %p\n", call);
-- 
cgit v1.2.3


From 1bae5d229532b4e8dfd5728cb3b8373bc9eec9eb Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 27 Mar 2018 23:08:20 +0100
Subject: rxrpc: Trace call completion

Add a tracepoint to track rxrpc calls moving into the completed state and
to log the completion type and the recorded error value and abort code.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-internal.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 9c9817ddafc5..21cf164b6d85 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -778,6 +778,7 @@ static inline bool __rxrpc_set_call_completion(struct rxrpc_call *call,
 		call->error = error;
 		call->completion = compl,
 		call->state = RXRPC_CALL_COMPLETE;
+		trace_rxrpc_call_complete(call);
 		wake_up(&call->waitq);
 		return true;
 	}
-- 
cgit v1.2.3


From 6f5c39fa5cd4a78c5432021e981aa8f79437a32c Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov" <tehnerd@fb.com>
Date: Mon, 26 Mar 2018 08:36:57 -0700
Subject: bpf: Add sock_ops R/W access to ipv4 tos

Sample usage for tos ...

  bpf_getsockopt(skops, SOL_IP, IP_TOS, &v, sizeof(v))

... where skops is a pointer to the ctx (struct bpf_sock_ops).

Signed-off-by: Nikita V. Shirokov <tehnerd@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 00c711c5f1a2..afd825534ac4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3462,6 +3462,27 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			ret = -EINVAL;
 		}
 #ifdef CONFIG_INET
+	} else if (level == SOL_IP) {
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET)
+			return -EINVAL;
+
+		val = *((int *)optval);
+		/* Only some options are supported */
+		switch (optname) {
+		case IP_TOS:
+			if (val < -1 || val > 0xff) {
+				ret = -EINVAL;
+			} else {
+				struct inet_sock *inet = inet_sk(sk);
+
+				if (val == -1)
+					val = 0;
+				inet->tos = val;
+			}
+			break;
+		default:
+			ret = -EINVAL;
+		}
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (level == SOL_IPV6) {
 		if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
@@ -3561,6 +3582,20 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 		} else {
 			goto err_clear;
 		}
+	} else if (level == SOL_IP) {
+		struct inet_sock *inet = inet_sk(sk);
+
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET)
+			goto err_clear;
+
+		/* Only some options are supported */
+		switch (optname) {
+		case IP_TOS:
+			*((int *)optval) = (int)inet->tos;
+			break;
+		default:
+			goto err_clear;
+		}
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (level == SOL_IPV6) {
 		struct ipv6_pinfo *np = inet6_sk(sk);
-- 
cgit v1.2.3


From c105547501016897194358b11451608a8d5f9a02 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 28 Mar 2018 12:05:32 -0700
Subject: treewide: remove large struct-pass-by-value from tracepoint arguments

- fix trace_hfi1_ctxt_info() to pass large struct by reference instead of by value
- convert 'type array[]' tracepoint arguments into 'type *array',
  since compiler will warn that sizeof('type array[]') == sizeof('type *array')
  and later should be used instead

The CAST_TO_U64 macro in the later patch will enforce that tracepoint
arguments can only be integers, pointers, or less than 8 byte structures.
Larger structures should be passed by reference.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/wireless/trace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 5152938b358d..018c81fa72fb 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3137,7 +3137,7 @@ TRACE_EVENT(rdev_start_radar_detection,
 
 TRACE_EVENT(rdev_set_mcast_rate,
 	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
-		 int mcast_rate[NUM_NL80211_BANDS]),
+		 int *mcast_rate),
 	TP_ARGS(wiphy, netdev, mcast_rate),
 	TP_STRUCT__entry(
 		WIPHY_ENTRY
-- 
cgit v1.2.3


From 14624a9387c4d80766d1fdd237b0d110a0a1b43d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 28 Mar 2018 12:05:34 -0700
Subject: net/mac802154: disambiguate mac80215 vs mac802154 trace events

two trace events defined with the same name and both unused.
They conflict in allyesconfig build. Rename one of them.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/mac802154/trace.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/mac802154/trace.h b/net/mac802154/trace.h
index 2c8a43d3607f..df855c33daf2 100644
--- a/net/mac802154/trace.h
+++ b/net/mac802154/trace.h
@@ -33,7 +33,7 @@
 
 /* Tracing for driver callbacks */
 
-DECLARE_EVENT_CLASS(local_only_evt,
+DECLARE_EVENT_CLASS(local_only_evt4,
 	TP_PROTO(struct ieee802154_local *local),
 	TP_ARGS(local),
 	TP_STRUCT__entry(
@@ -45,7 +45,7 @@ DECLARE_EVENT_CLASS(local_only_evt,
 	TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG)
 );
 
-DEFINE_EVENT(local_only_evt, 802154_drv_return_void,
+DEFINE_EVENT(local_only_evt4, 802154_drv_return_void,
 	TP_PROTO(struct ieee802154_local *local),
 	TP_ARGS(local)
 );
@@ -65,12 +65,12 @@ TRACE_EVENT(802154_drv_return_int,
 		  __entry->ret)
 );
 
-DEFINE_EVENT(local_only_evt, 802154_drv_start,
+DEFINE_EVENT(local_only_evt4, 802154_drv_start,
 	TP_PROTO(struct ieee802154_local *local),
 	TP_ARGS(local)
 );
 
-DEFINE_EVENT(local_only_evt, 802154_drv_stop,
+DEFINE_EVENT(local_only_evt4, 802154_drv_stop,
 	TP_PROTO(struct ieee802154_local *local),
 	TP_ARGS(local)
 );
-- 
cgit v1.2.3


From 57566b20033f23bdc9f25d5fa4c36a7287aa08d2 Mon Sep 17 00:00:00 2001
From: "tamizhr@codeaurora.org" <tamizhr@codeaurora.org>
Date: Tue, 27 Mar 2018 19:16:16 +0530
Subject: mac80211: Use proper smps_mode enum in sta opmode event

SMPS_MODE change value notified via nl80211 contains mac80211
specific value(ieee80211_smps_mode) and user space application
will not know those values. This patch add support to map
the mac80211 enum value to nl80211_smps_mode which will be
understood by the userspace application.

Signed-off-by: Tamizh chelvam <tamizhr@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ht.c          | 15 +++++++++++++++
 net/mac80211/ieee80211_i.h |  2 ++
 net/mac80211/rx.c          |  3 ++-
 3 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index d7523530d3f8..c78036a0ac94 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -466,6 +466,21 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata,
 		__ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_PEER_REQUEST);
 }
 
+enum nl80211_smps_mode
+ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps)
+{
+	switch (smps) {
+	case IEEE80211_SMPS_OFF:
+		return NL80211_SMPS_OFF;
+	case IEEE80211_SMPS_STATIC:
+		return NL80211_SMPS_STATIC;
+	case IEEE80211_SMPS_DYNAMIC:
+		return NL80211_SMPS_DYNAMIC;
+	default:
+		return NL80211_SMPS_OFF;
+	}
+}
+
 int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
 			       enum ieee80211_smps_mode smps, const u8 *da,
 			       const u8 *bssid)
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index ae9c33cd8ada..9237ffb4e986 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1788,6 +1788,8 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid);
 void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid);
 
 u8 ieee80211_mcs_to_chains(const struct ieee80211_mcs_info *mcs);
+enum nl80211_smps_mode
+ieee80211_smps_mode_to_smps_mode(enum ieee80211_smps_mode smps);
 
 /* VHT */
 void
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 27bb1f0b5e52..f8c69acfda48 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2883,7 +2883,8 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 			if (rx->sta->sta.smps_mode == smps_mode)
 				goto handled;
 			rx->sta->sta.smps_mode = smps_mode;
-			sta_opmode.smps_mode = smps_mode;
+			sta_opmode.smps_mode =
+				ieee80211_smps_mode_to_smps_mode(smps_mode);
 			sta_opmode.changed = STA_OPMODE_SMPS_MODE_CHANGED;
 
 			sband = rx->local->hw.wiphy->bands[status->band];
-- 
cgit v1.2.3


From 97f5f4254a61d7a104b5e609a6a0d9a10c73d29e Mon Sep 17 00:00:00 2001
From: "tamizhr@codeaurora.org" <tamizhr@codeaurora.org>
Date: Tue, 27 Mar 2018 19:16:17 +0530
Subject: mac80211: Use proper chan_width enum in sta opmode event

Bandwidth change value reported via nl80211 contains mac80211
specific enum value(ieee80211_sta_rx_bw) and which is not
understand by userspace application. Map the mac80211 specific
value to nl80211_chan_width enum value to avoid using wrong value
in the userspace application. And used station's ht/vht capability
to map IEEE80211_STA_RX_BW_20 and IEEE80211_STA_RX_BW_160 with
proper nl80211 value.

Signed-off-by: Tamizh chelvam <tamizhr@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/ieee80211_i.h |  2 ++
 net/mac80211/rx.c          |  3 ++-
 net/mac80211/vht.c         | 32 +++++++++++++++++++++++++++++++-
 3 files changed, 35 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 9237ffb4e986..6c341d89448c 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1816,6 +1816,8 @@ void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata,
 				      struct ieee80211_sta_vht_cap *vht_cap);
 void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
 				     u16 vht_mask[NL80211_VHT_NSS_MAX]);
+enum nl80211_chan_width
+ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta);
 
 /* Spectrum management */
 void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index f8c69acfda48..3a9f0c0a2de8 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2922,7 +2922,8 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 
 			rx->sta->sta.bandwidth = new_bw;
 			sband = rx->local->hw.wiphy->bands[status->band];
-			sta_opmode.bw = new_bw;
+			sta_opmode.bw =
+				ieee80211_sta_rx_bw_to_chan_width(rx->sta);
 			sta_opmode.changed = STA_OPMODE_MAX_BW_CHANGED;
 
 			rate_control_rate_update(local, sband, rx->sta,
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 5714dee76b12..259325cbcc31 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -358,6 +358,36 @@ enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta)
 	return NL80211_CHAN_WIDTH_80;
 }
 
+enum nl80211_chan_width
+ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta)
+{
+	enum ieee80211_sta_rx_bandwidth cur_bw = sta->sta.bandwidth;
+	struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap;
+	u32 cap_width;
+
+	switch (cur_bw) {
+	case IEEE80211_STA_RX_BW_20:
+		if (!sta->sta.ht_cap.ht_supported)
+			return NL80211_CHAN_WIDTH_20_NOHT;
+		else
+			return NL80211_CHAN_WIDTH_20;
+	case IEEE80211_STA_RX_BW_40:
+		return NL80211_CHAN_WIDTH_40;
+	case IEEE80211_STA_RX_BW_80:
+		return NL80211_CHAN_WIDTH_80;
+	case IEEE80211_STA_RX_BW_160:
+		cap_width =
+			vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
+
+		if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ)
+			return NL80211_CHAN_WIDTH_160;
+
+		return NL80211_CHAN_WIDTH_80P80;
+	default:
+		return NL80211_CHAN_WIDTH_20;
+	}
+}
+
 enum ieee80211_sta_rx_bandwidth
 ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width)
 {
@@ -484,7 +514,7 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
 	new_bw = ieee80211_sta_cur_vht_bw(sta);
 	if (new_bw != sta->sta.bandwidth) {
 		sta->sta.bandwidth = new_bw;
-		sta_opmode.bw = new_bw;
+		sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(sta);
 		changed |= IEEE80211_RC_BW_CHANGED;
 		sta_opmode.changed |= STA_OPMODE_MAX_BW_CHANGED;
 	}
-- 
cgit v1.2.3


From 850964519a65a96aa8231ff31e28f30c2633484d Mon Sep 17 00:00:00 2001
From: Dmitry Lebed <dlebed@quantenna.com>
Date: Mon, 26 Mar 2018 16:36:31 +0300
Subject: cfg80211: fix CAC_STARTED event handling

Exclude CAC_STARTED event from !wdev->cac_started check,
since cac_started will be set later in the same function.

Signed-off-by: Dmitry Lebed <dlebed@quantenna.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/mlme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 6b6818dd76bd..12b3edf70a7b 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -872,7 +872,7 @@ void cfg80211_cac_event(struct net_device *netdev,
 
 	trace_cfg80211_cac_event(netdev, event);
 
-	if (WARN_ON(!wdev->cac_started))
+	if (WARN_ON(!wdev->cac_started && event != NL80211_RADAR_CAC_STARTED))
 		return;
 
 	if (WARN_ON(!wdev->chandef.chan))
-- 
cgit v1.2.3


From 2c390e44e46675da0e7bba5c3b921a091a60ec2c Mon Sep 17 00:00:00 2001
From: Dmitry Lebed <dlebed@quantenna.com>
Date: Mon, 26 Mar 2018 16:36:32 +0300
Subject: cfg80211: enable use of non-cleared DFS channels for DFS offload

Currently channel switch/start_ap to DFS channel cannot be done to
non-CAC-cleared channel even if DFS offload if enabled.
Make non-cleared DFS channels available if DFS offload is enabled.
CAC will be started by HW after channel change, start_ap call, etc.

Signed-off-by: Dmitry Lebed <dlebed@quantenna.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/chan.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index a48859982a32..2db713d18f71 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -579,6 +579,10 @@ static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy,
 {
 	struct ieee80211_channel *c;
 	u32 freq, start_freq, end_freq;
+	bool dfs_offload;
+
+	dfs_offload = wiphy_ext_feature_isset(wiphy,
+					      NL80211_EXT_FEATURE_DFS_OFFLOAD);
 
 	start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
 	end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
@@ -596,8 +600,9 @@ static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy,
 		if (c->flags & IEEE80211_CHAN_DISABLED)
 			return false;
 
-		if ((c->flags & IEEE80211_CHAN_RADAR)  &&
-		    (c->dfs_state != NL80211_DFS_AVAILABLE))
+		if ((c->flags & IEEE80211_CHAN_RADAR) &&
+		    (c->dfs_state != NL80211_DFS_AVAILABLE) &&
+		    !(c->dfs_state == NL80211_DFS_USABLE && dfs_offload))
 			return false;
 	}
 
-- 
cgit v1.2.3


From 37b1c004685a3cea420dd96aa3803da627359f60 Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Mon, 26 Mar 2018 12:52:44 -0500
Subject: cfg80211: Support all iftypes in autodisconnect_wk

Currently autodisconnect_wk assumes that only interface types of
P2P_CLIENT and STATION use conn_owner_nlportid.  Change this so all
interface types are supported.

Signed-off-by: Denis Kenzior <denkenz@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/sme.c | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 701cfd7acc1b..5df6b33db786 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -1239,17 +1239,38 @@ void cfg80211_autodisconnect_wk(struct work_struct *work)
 	wdev_lock(wdev);
 
 	if (wdev->conn_owner_nlportid) {
-		/*
-		 * Use disconnect_bssid if still connecting and ops->disconnect
-		 * not implemented.  Otherwise we can use cfg80211_disconnect.
-		 */
-		if (rdev->ops->disconnect || wdev->current_bss)
-			cfg80211_disconnect(rdev, wdev->netdev,
-					    WLAN_REASON_DEAUTH_LEAVING, true);
-		else
-			cfg80211_mlme_deauth(rdev, wdev->netdev,
-					     wdev->disconnect_bssid, NULL, 0,
-					     WLAN_REASON_DEAUTH_LEAVING, false);
+		switch (wdev->iftype) {
+		case NL80211_IFTYPE_ADHOC:
+			cfg80211_leave_ibss(rdev, wdev->netdev, false);
+			break;
+		case NL80211_IFTYPE_AP:
+		case NL80211_IFTYPE_P2P_GO:
+			cfg80211_stop_ap(rdev, wdev->netdev, false);
+			break;
+		case NL80211_IFTYPE_MESH_POINT:
+			cfg80211_leave_mesh(rdev, wdev->netdev);
+			break;
+		case NL80211_IFTYPE_STATION:
+		case NL80211_IFTYPE_P2P_CLIENT:
+			/*
+			 * Use disconnect_bssid if still connecting and
+			 * ops->disconnect not implemented.  Otherwise we can
+			 * use cfg80211_disconnect.
+			 */
+			if (rdev->ops->disconnect || wdev->current_bss)
+				cfg80211_disconnect(rdev, wdev->netdev,
+						    WLAN_REASON_DEAUTH_LEAVING,
+						    true);
+			else
+				cfg80211_mlme_deauth(rdev, wdev->netdev,
+						     wdev->disconnect_bssid,
+						     NULL, 0,
+						     WLAN_REASON_DEAUTH_LEAVING,
+						     false);
+			break;
+		default:
+			break;
+		}
 	}
 
 	wdev_unlock(wdev);
-- 
cgit v1.2.3


From f8d16d3edb4dbae080df04318423c360de3c594d Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Mon, 26 Mar 2018 12:52:45 -0500
Subject: nl80211: Add SOCKET_OWNER support to JOIN_IBSS

Signed-off-by: Denis Kenzior <denkenz@gmail.com>
[johannes: fix race with wdev lock/unlock by just acquiring once]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/core.h    |  8 ++++----
 net/wireless/ibss.c    | 27 ++++++---------------------
 net/wireless/nl80211.c |  7 ++++++-
 3 files changed, 16 insertions(+), 26 deletions(-)

(limited to 'net')

diff --git a/net/wireless/core.h b/net/wireless/core.h
index eaff636169c2..b5cf3ea8d4df 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -282,10 +282,10 @@ void cfg80211_bss_age(struct cfg80211_registered_device *rdev,
                       unsigned long age_secs);
 
 /* IBSS */
-int cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
-		       struct net_device *dev,
-		       struct cfg80211_ibss_params *params,
-		       struct cfg80211_cached_keys *connkeys);
+int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
+			 struct net_device *dev,
+			 struct cfg80211_ibss_params *params,
+			 struct cfg80211_cached_keys *connkeys);
 void cfg80211_clear_ibss(struct net_device *dev, bool nowext);
 int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
 			  struct net_device *dev, bool nowext);
diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c
index a1d10993d08a..d1743e6abc34 100644
--- a/net/wireless/ibss.c
+++ b/net/wireless/ibss.c
@@ -84,14 +84,15 @@ void cfg80211_ibss_joined(struct net_device *dev, const u8 *bssid,
 }
 EXPORT_SYMBOL(cfg80211_ibss_joined);
 
-static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
-				struct net_device *dev,
-				struct cfg80211_ibss_params *params,
-				struct cfg80211_cached_keys *connkeys)
+int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
+			 struct net_device *dev,
+			 struct cfg80211_ibss_params *params,
+			 struct cfg80211_cached_keys *connkeys)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	int err;
 
+	ASSERT_RTNL();
 	ASSERT_WDEV_LOCK(wdev);
 
 	if (wdev->ssid_len)
@@ -146,23 +147,6 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
 	return 0;
 }
 
-int cfg80211_join_ibss(struct cfg80211_registered_device *rdev,
-		       struct net_device *dev,
-		       struct cfg80211_ibss_params *params,
-		       struct cfg80211_cached_keys *connkeys)
-{
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	int err;
-
-	ASSERT_RTNL();
-
-	wdev_lock(wdev);
-	err = __cfg80211_join_ibss(rdev, dev, params, connkeys);
-	wdev_unlock(wdev);
-
-	return err;
-}
-
 static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
@@ -224,6 +208,7 @@ int __cfg80211_leave_ibss(struct cfg80211_registered_device *rdev,
 	if (err)
 		return err;
 
+	wdev->conn_owner_nlportid = 0;
 	__cfg80211_clear_ibss(dev, nowext);
 
 	return 0;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index fe27ab443d01..13f7c002f562 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -8679,9 +8679,14 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 	ibss.userspace_handles_dfs =
 		nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]);
 
-	err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys);
+	wdev_lock(dev->ieee80211_ptr);
+	err = __cfg80211_join_ibss(rdev, dev, &ibss, connkeys);
 	if (err)
 		kzfree(connkeys);
+	else if (info->attrs[NL80211_ATTR_SOCKET_OWNER])
+		dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
+	wdev_unlock(dev->ieee80211_ptr);
+
 	return err;
 }
 
-- 
cgit v1.2.3


From 188c1b3c04d69e842122daf201f07a34fcfad039 Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Mon, 26 Mar 2018 12:52:46 -0500
Subject: nl80211: Add SOCKET_OWNER support to JOIN_MESH

Signed-off-by: Denis Kenzior <denkenz@gmail.com>
[johannes: fix race with wdev lock/unlock by just acquiring once]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/core.h    |  4 ----
 net/wireless/mesh.c    | 16 +---------------
 net/wireless/nl80211.c | 10 ++++++++--
 3 files changed, 9 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/wireless/core.h b/net/wireless/core.h
index b5cf3ea8d4df..63eb1b5fdd04 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -303,10 +303,6 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
 			 struct net_device *dev,
 			 struct mesh_setup *setup,
 			 const struct mesh_config *conf);
-int cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
-		       struct net_device *dev,
-		       struct mesh_setup *setup,
-		       const struct mesh_config *conf);
 int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
 			  struct net_device *dev);
 int cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c
index b12da6ef3c12..eac5aa1419fc 100644
--- a/net/wireless/mesh.c
+++ b/net/wireless/mesh.c
@@ -217,21 +217,6 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
 	return err;
 }
 
-int cfg80211_join_mesh(struct cfg80211_registered_device *rdev,
-		       struct net_device *dev,
-		       struct mesh_setup *setup,
-		       const struct mesh_config *conf)
-{
-	struct wireless_dev *wdev = dev->ieee80211_ptr;
-	int err;
-
-	wdev_lock(wdev);
-	err = __cfg80211_join_mesh(rdev, dev, setup, conf);
-	wdev_unlock(wdev);
-
-	return err;
-}
-
 int cfg80211_set_mesh_channel(struct cfg80211_registered_device *rdev,
 			      struct wireless_dev *wdev,
 			      struct cfg80211_chan_def *chandef)
@@ -286,6 +271,7 @@ int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev,
 
 	err = rdev_leave_mesh(rdev, dev);
 	if (!err) {
+		wdev->conn_owner_nlportid = 0;
 		wdev->mesh_id_len = 0;
 		wdev->beacon_interval = 0;
 		memset(&wdev->chandef, 0, sizeof(wdev->chandef));
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 13f7c002f562..1d6e81e5b2c8 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -10092,7 +10092,7 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
 		if (err)
 			return err;
 	} else {
-		/* cfg80211_join_mesh() will sort it out */
+		/* __cfg80211_join_mesh() will sort it out */
 		setup.chandef.chan = NULL;
 	}
 
@@ -10130,7 +10130,13 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
 	setup.userspace_handles_dfs =
 		nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]);
 
-	return cfg80211_join_mesh(rdev, dev, &setup, &cfg);
+	wdev_lock(dev->ieee80211_ptr);
+	err = __cfg80211_join_mesh(rdev, dev, &setup, &cfg);
+	if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER])
+		dev->ieee80211_ptr->conn_owner_nlportid = info->snd_portid;
+	wdev_unlock(dev->ieee80211_ptr);
+
+	return err;
 }
 
 static int nl80211_leave_mesh(struct sk_buff *skb, struct genl_info *info)
-- 
cgit v1.2.3


From 466a306142c002b40deaa58da94741af4153d1c4 Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Mon, 26 Mar 2018 12:52:47 -0500
Subject: nl80211: Add SOCKET_OWNER support to START_AP

Signed-off-by: Denis Kenzior <denkenz@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/ap.c      | 1 +
 net/wireless/nl80211.c | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'net')

diff --git a/net/wireless/ap.c b/net/wireless/ap.c
index 63682176c96c..882d97bdc6bf 100644
--- a/net/wireless/ap.c
+++ b/net/wireless/ap.c
@@ -27,6 +27,7 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev,
 
 	err = rdev_stop_ap(rdev, dev);
 	if (!err) {
+		wdev->conn_owner_nlportid = 0;
 		wdev->beacon_interval = 0;
 		memset(&wdev->chandef, 0, sizeof(wdev->chandef));
 		wdev->ssid_len = 0;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 1d6e81e5b2c8..cfaf2aeb9783 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4134,6 +4134,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 		wdev->chandef = params.chandef;
 		wdev->ssid_len = params.ssid_len;
 		memcpy(wdev->ssid, params.ssid, wdev->ssid_len);
+
+		if (info->attrs[NL80211_ATTR_SOCKET_OWNER])
+			wdev->conn_owner_nlportid = info->snd_portid;
 	}
 	wdev_unlock(wdev);
 
-- 
cgit v1.2.3


From 230ebaa189af44d50dccb4a1846e39ca594e347b Mon Sep 17 00:00:00 2001
From: Haim Dreyfuss <haim.dreyfuss@intel.com>
Date: Wed, 28 Mar 2018 13:24:09 +0300
Subject: cfg80211: read wmm rules from regulatory database

ETSI EN 301 893 v2.1.1 (2017-05) standard defines a new channel access
mechanism that all devices (WLAN and LAA) need to comply with.
The regulatory database can now be loaded into the kernel and also
has the option to load optional data.
In order to be able to comply with ETSI standard, we add wmm_rule into
regulatory rule and add the option to read its value from the regulatory
database.

Signed-off-by: Haim Dreyfuss <haim.dreyfuss@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
[johannes: fix memory leak in error path]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/reg.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 141 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 7b42f0bacfd8..eddc834f6358 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -5,6 +5,7 @@
  * Copyright 2008-2011	Luis R. Rodriguez <mcgrof@qca.qualcomm.com>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright      2017  Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
  *
  * Permission to use, copy, modify, and/or distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -424,23 +425,36 @@ static const struct ieee80211_regdomain *
 reg_copy_regd(const struct ieee80211_regdomain *src_regd)
 {
 	struct ieee80211_regdomain *regd;
-	int size_of_regd;
+	int size_of_regd, size_of_wmms;
 	unsigned int i;
+	struct ieee80211_wmm_rule *d_wmm, *s_wmm;
 
 	size_of_regd =
 		sizeof(struct ieee80211_regdomain) +
 		src_regd->n_reg_rules * sizeof(struct ieee80211_reg_rule);
+	size_of_wmms = src_regd->n_wmm_rules *
+		sizeof(struct ieee80211_wmm_rule);
 
-	regd = kzalloc(size_of_regd, GFP_KERNEL);
+	regd = kzalloc(size_of_regd + size_of_wmms, GFP_KERNEL);
 	if (!regd)
 		return ERR_PTR(-ENOMEM);
 
 	memcpy(regd, src_regd, sizeof(struct ieee80211_regdomain));
 
-	for (i = 0; i < src_regd->n_reg_rules; i++)
+	d_wmm = (struct ieee80211_wmm_rule *)((u8 *)regd + size_of_regd);
+	s_wmm = (struct ieee80211_wmm_rule *)((u8 *)src_regd + size_of_regd);
+	memcpy(d_wmm, s_wmm, size_of_wmms);
+
+	for (i = 0; i < src_regd->n_reg_rules; i++) {
 		memcpy(&regd->reg_rules[i], &src_regd->reg_rules[i],
 		       sizeof(struct ieee80211_reg_rule));
+		if (!src_regd->reg_rules[i].wmm_rule)
+			continue;
 
+		regd->reg_rules[i].wmm_rule = d_wmm +
+			(src_regd->reg_rules[i].wmm_rule - s_wmm) /
+			sizeof(struct ieee80211_wmm_rule);
+	}
 	return regd;
 }
 
@@ -595,6 +609,17 @@ enum fwdb_flags {
 	FWDB_FLAG_AUTO_BW	= BIT(4),
 };
 
+struct fwdb_wmm_ac {
+	u8 ecw;
+	u8 aifsn;
+	__be16 cot;
+} __packed;
+
+struct fwdb_wmm_rule {
+	struct fwdb_wmm_ac client[IEEE80211_NUM_ACS];
+	struct fwdb_wmm_ac ap[IEEE80211_NUM_ACS];
+} __packed;
+
 struct fwdb_rule {
 	u8 len;
 	u8 flags;
@@ -602,6 +627,7 @@ struct fwdb_rule {
 	__be32 start, end, max_bw;
 	/* start of optional data */
 	__be16 cac_timeout;
+	__be16 wmm_ptr;
 } __packed __aligned(4);
 
 #define FWDB_MAGIC 0x52474442
@@ -613,6 +639,31 @@ struct fwdb_header {
 	struct fwdb_country country[];
 } __packed __aligned(4);
 
+static int ecw2cw(int ecw)
+{
+	return (1 << ecw) - 1;
+}
+
+static bool valid_wmm(struct fwdb_wmm_rule *rule)
+{
+	struct fwdb_wmm_ac *ac = (struct fwdb_wmm_ac *)rule;
+	int i;
+
+	for (i = 0; i < IEEE80211_NUM_ACS * 2; i++) {
+		u16 cw_min = ecw2cw((ac[i].ecw & 0xf0) >> 4);
+		u16 cw_max = ecw2cw(ac[i].ecw & 0x0f);
+		u8 aifsn = ac[i].aifsn;
+
+		if (cw_min >= cw_max)
+			return false;
+
+		if (aifsn < 1)
+			return false;
+	}
+
+	return true;
+}
+
 static bool valid_rule(const u8 *data, unsigned int size, u16 rule_ptr)
 {
 	struct fwdb_rule *rule = (void *)(data + (rule_ptr << 2));
@@ -623,7 +674,18 @@ static bool valid_rule(const u8 *data, unsigned int size, u16 rule_ptr)
 	/* mandatory fields */
 	if (rule->len < offsetofend(struct fwdb_rule, max_bw))
 		return false;
+	if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) {
+		u32 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2;
+		struct fwdb_wmm_rule *wmm;
 
+		if (wmm_ptr + sizeof(struct fwdb_wmm_rule) > size)
+			return false;
+
+		wmm = (void *)(data + wmm_ptr);
+
+		if (!valid_wmm(wmm))
+			return false;
+	}
 	return true;
 }
 
@@ -798,23 +860,64 @@ static bool valid_regdb(const u8 *data, unsigned int size)
 	return true;
 }
 
+static void set_wmm_rule(struct ieee80211_wmm_rule *rule,
+			 struct fwdb_wmm_rule *wmm)
+{
+	unsigned int i;
+
+	for (i = 0; i < IEEE80211_NUM_ACS; i++) {
+		rule->client[i].cw_min =
+			ecw2cw((wmm->client[i].ecw & 0xf0) >> 4);
+		rule->client[i].cw_max = ecw2cw(wmm->client[i].ecw & 0x0f);
+		rule->client[i].aifsn =  wmm->client[i].aifsn;
+		rule->client[i].cot = 1000 * be16_to_cpu(wmm->client[i].cot);
+		rule->ap[i].cw_min = ecw2cw((wmm->ap[i].ecw & 0xf0) >> 4);
+		rule->ap[i].cw_max = ecw2cw(wmm->ap[i].ecw & 0x0f);
+		rule->ap[i].aifsn = wmm->ap[i].aifsn;
+		rule->ap[i].cot = 1000 * be16_to_cpu(wmm->ap[i].cot);
+	}
+}
+
+struct wmm_ptrs {
+	struct ieee80211_wmm_rule *rule;
+	u32 ptr;
+};
+
+static struct ieee80211_wmm_rule *find_wmm_ptr(struct wmm_ptrs *wmm_ptrs,
+					       u32 wmm_ptr, int n_wmms)
+{
+	int i;
+
+	for (i = 0; i < n_wmms; i++) {
+		if (wmm_ptrs[i].ptr == wmm_ptr)
+			return wmm_ptrs[i].rule;
+	}
+	return NULL;
+}
+
 static int regdb_query_country(const struct fwdb_header *db,
 			       const struct fwdb_country *country)
 {
 	unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2;
 	struct fwdb_collection *coll = (void *)((u8 *)db + ptr);
 	struct ieee80211_regdomain *regdom;
-	unsigned int size_of_regd;
-	unsigned int i;
+	struct ieee80211_regdomain *tmp_rd;
+	unsigned int size_of_regd, i, n_wmms = 0;
+	struct wmm_ptrs *wmm_ptrs;
 
-	size_of_regd =
-		sizeof(struct ieee80211_regdomain) +
+	size_of_regd = sizeof(struct ieee80211_regdomain) +
 		coll->n_rules * sizeof(struct ieee80211_reg_rule);
 
 	regdom = kzalloc(size_of_regd, GFP_KERNEL);
 	if (!regdom)
 		return -ENOMEM;
 
+	wmm_ptrs = kcalloc(coll->n_rules, sizeof(*wmm_ptrs), GFP_KERNEL);
+	if (!wmm_ptrs) {
+		kfree(regdom);
+		return -ENOMEM;
+	}
+
 	regdom->n_reg_rules = coll->n_rules;
 	regdom->alpha2[0] = country->alpha2[0];
 	regdom->alpha2[1] = country->alpha2[1];
@@ -851,7 +954,38 @@ static int regdb_query_country(const struct fwdb_header *db,
 		if (rule->len >= offsetofend(struct fwdb_rule, cac_timeout))
 			rrule->dfs_cac_ms =
 				1000 * be16_to_cpu(rule->cac_timeout);
+		if (rule->len >= offsetofend(struct fwdb_rule, wmm_ptr)) {
+			u32 wmm_ptr = be16_to_cpu(rule->wmm_ptr) << 2;
+			struct ieee80211_wmm_rule *wmm_pos =
+				find_wmm_ptr(wmm_ptrs, wmm_ptr, n_wmms);
+			struct fwdb_wmm_rule *wmm;
+			struct ieee80211_wmm_rule *wmm_rule;
+
+			if (wmm_pos) {
+				rrule->wmm_rule = wmm_pos;
+				continue;
+			}
+			wmm = (void *)((u8 *)db + wmm_ptr);
+			tmp_rd = krealloc(regdom, size_of_regd + (n_wmms + 1) *
+					  sizeof(struct ieee80211_wmm_rule),
+					  GFP_KERNEL);
+
+			if (!tmp_rd) {
+				kfree(regdom);
+				return -ENOMEM;
+			}
+			regdom = tmp_rd;
+
+			wmm_rule = (struct ieee80211_wmm_rule *)
+				((u8 *)regdom + size_of_regd + n_wmms *
+				sizeof(struct ieee80211_wmm_rule));
+
+			set_wmm_rule(wmm_rule, wmm);
+			wmm_ptrs[n_wmms].ptr = wmm_ptr;
+			wmm_ptrs[n_wmms++].rule = wmm_rule;
+		}
 	}
+	kfree(wmm_ptrs);
 
 	return reg_schedule_apply(regdom);
 }
-- 
cgit v1.2.3


From 5bf16a11ba2940c67d0b3a2154813bb42e8c6281 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Tue, 27 Feb 2018 11:22:15 +0100
Subject: cfg80211: don't require RTNL held for regdomain reads

The whole code is set up to allow RCU reads of this data, but
then uses rtnl_dereference() which requires the RTNL. Convert
it to rcu_dereference_rtnl() which makes it require only RCU
or the RTNL, to allow RCU-protected reading of the data.

Reviewed-by: Coelho, Luciano <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/reg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index eddc834f6358..e352a0d1c438 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -135,12 +135,12 @@ static void restore_regulatory_settings(bool reset_user);
 
 static const struct ieee80211_regdomain *get_cfg80211_regdom(void)
 {
-	return rtnl_dereference(cfg80211_regdomain);
+	return rcu_dereference_rtnl(cfg80211_regdomain);
 }
 
 const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy)
 {
-	return rtnl_dereference(wiphy->regd);
+	return rcu_dereference_rtnl(wiphy->regd);
 }
 
 static const char *reg_dfs_region_str(enum nl80211_dfs_regions dfs_region)
-- 
cgit v1.2.3


From e552af058148498c8a0874edf6b022caea9cb2b7 Mon Sep 17 00:00:00 2001
From: Haim Dreyfuss <haim.dreyfuss@intel.com>
Date: Wed, 28 Mar 2018 13:24:10 +0300
Subject: mac80211: limit wmm params to comply with ETSI requirements

ETSI has recently added new requirements that restrict the WMM
parameter values for 5GHz frequencies.  We need to take care of the
following scenarios in order to comply with these new requirements:

1. When using mac80211 default values;
2. When the userspace tries to configure its own values;
3. When associating to an AP which advertises WWM IE.

When associating to an AP, the client uses the values in the
advertised WMM IE.  But the AP may not comply with the new ETSI
requirements, so the client needs to check the current regulatory
rules and use those limits accordingly.

Signed-off-by: Haim Dreyfuss <haim.dreyfuss@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/cfg.c         |  3 +++
 net/mac80211/ieee80211_i.h |  4 ++++
 net/mac80211/mlme.c        |  1 +
 net/mac80211/util.c        | 44 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 52 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 5c4b105ca398..36d128ebbac8 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -4,6 +4,7 @@
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2015  Intel Mobile Communications GmbH
  * Copyright (C) 2015-2017 Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
  *
  * This file is GPLv2 as found in COPYING.
  */
@@ -2156,6 +2157,8 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy,
 	 */
 	p.uapsd = false;
 
+	ieee80211_regulatory_limit_wmm_params(sdata, &p, params->ac);
+
 	sdata->tx_conf[params->ac] = p;
 	if (drv_conf_tx(local, sdata, params->ac, &p)) {
 		wiphy_debug(local->hw.wiphy,
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 6c341d89448c..5b3d78362fb5 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -4,6 +4,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2007-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2015  Intel Mobile Communications GmbH
+ * Copyright (C) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -1869,6 +1870,9 @@ extern const void *const mac80211_wiphy_privid; /* for wiphy privid */
 int ieee80211_frame_duration(enum nl80211_band band, size_t len,
 			     int rate, int erp, int short_preamble,
 			     int shift);
+void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata,
+					   struct ieee80211_tx_queue_params *qparam,
+					   int ac);
 void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
 			       bool bss_notify, bool enable_qos);
 void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index ea3ba03fb952..a3bcf953d423 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1792,6 +1792,7 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
 				   params[ac].cw_min, params[ac].cw_max, aci);
 			return false;
 		}
+		ieee80211_regulatory_limit_wmm_params(sdata, &params[ac], ac);
 	}
 
 	for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 55cd2922627a..11f9cfc016d9 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -5,6 +5,7 @@
  * Copyright 2007	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
  * Copyright (C) 2015-2017	Intel Deutschland GmbH
+ * Copyright (C) 2018 Intel Corporation
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -1113,6 +1114,48 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action,
 	return crc;
 }
 
+void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata,
+					   struct ieee80211_tx_queue_params
+					   *qparam, int ac)
+{
+	struct ieee80211_chanctx_conf *chanctx_conf;
+	const struct ieee80211_reg_rule *rrule;
+	struct ieee80211_wmm_ac *wmm_ac;
+	u16 center_freq = 0;
+
+	if (sdata->vif.type != NL80211_IFTYPE_AP &&
+	    sdata->vif.type != NL80211_IFTYPE_STATION)
+		return;
+
+	rcu_read_lock();
+	chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+	if (chanctx_conf)
+		center_freq = chanctx_conf->def.chan->center_freq;
+
+	if (!center_freq) {
+		rcu_read_unlock();
+		return;
+	}
+
+	rrule = freq_reg_info(sdata->wdev.wiphy, MHZ_TO_KHZ(center_freq));
+
+	if (IS_ERR_OR_NULL(rrule) || !rrule->wmm_rule) {
+		rcu_read_unlock();
+		return;
+	}
+
+	if (sdata->vif.type == NL80211_IFTYPE_AP)
+		wmm_ac = &rrule->wmm_rule->ap[ac];
+	else
+		wmm_ac = &rrule->wmm_rule->client[ac];
+	qparam->cw_min = max_t(u16, qparam->cw_min, wmm_ac->cw_min);
+	qparam->cw_max = max_t(u16, qparam->cw_max, wmm_ac->cw_max);
+	qparam->aifs = max_t(u8, qparam->aifs, wmm_ac->aifsn);
+	qparam->txop = !qparam->txop ? wmm_ac->cot / 32 :
+		min_t(u16, qparam->txop, wmm_ac->cot / 32);
+	rcu_read_unlock();
+}
+
 void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
 			       bool bss_notify, bool enable_qos)
 {
@@ -1206,6 +1249,7 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata,
 				break;
 			}
 		}
+		ieee80211_regulatory_limit_wmm_params(sdata, &qparam, ac);
 
 		qparam.uapsd = false;
 
-- 
cgit v1.2.3


From 19d3577e35e0cbb42694811b096e749a0f89a824 Mon Sep 17 00:00:00 2001
From: Haim Dreyfuss <haim.dreyfuss@intel.com>
Date: Wed, 28 Mar 2018 13:24:11 +0300
Subject: cfg80211: Add API to allow querying regdb for wmm_rule

In general regulatory self managed devices maintain their own
regulatory profiles thus it doesn't have to query the regulatory database
on country change.

ETSI has recently introduced a new channel access mechanism for 5GHz
that all wlan devices need to comply with.
These values are stored in the regulatory database.
There are self managed devices which can't maintain these
values on their own. Add API to allow self managed regulatory devices
to query the regulatory database for high band wmm rule.

Signed-off-by: Haim Dreyfuss <haim.dreyfuss@intel.com>
Signed-off-by: Luca Coelho <luciano.coelho@intel.com>
[johannes: fix documentation]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/reg.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

(limited to 'net')

diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index e352a0d1c438..16c7e4ef5820 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -878,6 +878,60 @@ static void set_wmm_rule(struct ieee80211_wmm_rule *rule,
 	}
 }
 
+static int __regdb_query_wmm(const struct fwdb_header *db,
+			     const struct fwdb_country *country, int freq,
+			     u32 *dbptr, struct ieee80211_wmm_rule *rule)
+{
+	unsigned int ptr = be16_to_cpu(country->coll_ptr) << 2;
+	struct fwdb_collection *coll = (void *)((u8 *)db + ptr);
+	int i;
+
+	for (i = 0; i < coll->n_rules; i++) {
+		__be16 *rules_ptr = (void *)((u8 *)coll + ALIGN(coll->len, 2));
+		unsigned int rule_ptr = be16_to_cpu(rules_ptr[i]) << 2;
+		struct fwdb_rule *rrule = (void *)((u8 *)db + rule_ptr);
+		struct fwdb_wmm_rule *wmm;
+		unsigned int wmm_ptr;
+
+		if (rrule->len < offsetofend(struct fwdb_rule, wmm_ptr))
+			continue;
+
+		if (freq >= KHZ_TO_MHZ(be32_to_cpu(rrule->start)) &&
+		    freq <= KHZ_TO_MHZ(be32_to_cpu(rrule->end))) {
+			wmm_ptr = be16_to_cpu(rrule->wmm_ptr) << 2;
+			wmm = (void *)((u8 *)db + wmm_ptr);
+			set_wmm_rule(rule, wmm);
+			if (dbptr)
+				*dbptr = wmm_ptr;
+			return 0;
+		}
+	}
+
+	return -ENODATA;
+}
+
+int reg_query_regdb_wmm(char *alpha2, int freq, u32 *dbptr,
+			struct ieee80211_wmm_rule *rule)
+{
+	const struct fwdb_header *hdr = regdb;
+	const struct fwdb_country *country;
+
+	if (IS_ERR(regdb))
+		return PTR_ERR(regdb);
+
+	country = &hdr->country[0];
+	while (country->coll_ptr) {
+		if (alpha2_equal(alpha2, country->alpha2))
+			return __regdb_query_wmm(regdb, country, freq, dbptr,
+						 rule);
+
+		country++;
+	}
+
+	return -ENODATA;
+}
+EXPORT_SYMBOL(reg_query_regdb_wmm);
+
 struct wmm_ptrs {
 	struct ieee80211_wmm_rule *rule;
 	u32 ptr;
-- 
cgit v1.2.3


From db3bdcb9c3ffc628c5284d7ed03a704295ba1214 Mon Sep 17 00:00:00 2001
From: Manikanta Pubbisetty <mpubbise@codeaurora.org>
Date: Wed, 28 Mar 2018 18:34:19 +0530
Subject: mac80211: allow AP_VLAN operation on crypto controlled devices

In the current implementation, mac80211 advertises the support of
AP_VLANs based on the driver's support for AP mode; it also
blocks encrypted AP_VLAN operation on devices advertising
SW_CRYPTO_CONTROL.

The implementation seems weird in it's current form and could be
often confusing, this is because there can be drivers advertising
both SW_CRYPTO_CONTROL and AP mode support (ex: ath10k) in which case
AP_VLAN will still be supported but only in open BSS and not in
secured BSS.

When SW_CRYPTO_CONTROL is enabled, it makes more sense if the decision
to support AP_VLANs is left to the driver. Mac80211 can then allow
AP_VLAN operations depending on the driver support.

Signed-off-by: Manikanta Pubbisetty <mpubbise@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/key.c  | 8 +++++---
 net/mac80211/main.c | 8 ++++++--
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index aee05ec3f7ea..ee0d0cc8dc3b 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -126,7 +126,7 @@ static void decrease_tailroom_need_count(struct ieee80211_sub_if_data *sdata,
 
 static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
 {
-	struct ieee80211_sub_if_data *sdata;
+	struct ieee80211_sub_if_data *sdata = key->sdata;
 	struct sta_info *sta;
 	int ret = -EOPNOTSUPP;
 
@@ -162,7 +162,6 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
 	if (sta && !sta->uploaded)
 		goto out_unsupported;
 
-	sdata = key->sdata;
 	if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) {
 		/*
 		 * The driver doesn't know anything about VLAN interfaces.
@@ -214,8 +213,11 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key)
 		/* all of these we can do in software - if driver can */
 		if (ret == 1)
 			return 0;
-		if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL))
+		if (ieee80211_hw_check(&key->local->hw, SW_CRYPTO_CONTROL)) {
+			if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+				return 0;
 			return -EINVAL;
+		}
 		return 0;
 	default:
 		return -EINVAL;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 0785d04a80bc..8d0333b5355b 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -930,8 +930,12 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 			             IEEE80211_HT_CAP_SM_PS_SHIFT;
 	}
 
-	/* if low-level driver supports AP, we also support VLAN */
-	if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_AP)) {
+	/* if low-level driver supports AP, we also support VLAN.
+	 * drivers advertising SW_CRYPTO_CONTROL should enable AP_VLAN
+	 * based on their support to transmit SW encrypted packets.
+	 */
+	if (local->hw.wiphy->interface_modes & BIT(NL80211_IFTYPE_AP) &&
+	    !ieee80211_hw_check(&local->hw, SW_CRYPTO_CONTROL)) {
 		hw->wiphy->interface_modes |= BIT(NL80211_IFTYPE_AP_VLAN);
 		hw->wiphy->software_iftypes |= BIT(NL80211_IFTYPE_AP_VLAN);
 	}
-- 
cgit v1.2.3


From 4d191c75365a0067a9d5b8c8746b1bd9310c5a70 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 29 Mar 2018 11:14:30 +0200
Subject: mac80211: remove shadowing duplicated variable

We already have 'ifmgd' here, and it's already assigned
to the same value, so remove the duplicate.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mlme.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index a3bcf953d423..d2bc52046729 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -2011,8 +2011,6 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
 
 	/* deauthenticate/disassociate now */
 	if (tx || frame_buf) {
-		struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
-
 		/*
 		 * In multi channel scenarios guarantee that the virtual
 		 * interface is granted immediate airtime to transmit the
-- 
cgit v1.2.3


From 6a671a50f8199b3e1fe49fa8afff0fc8335da79c Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Mon, 26 Mar 2018 12:52:41 -0500
Subject: nl80211: Add CMD_CONTROL_PORT_FRAME API

This commit also adds cfg80211_rx_control_port function.  This is used
to generate a CMD_CONTROL_PORT_FRAME event out to userspace.  The
conn_owner_nlportid is used as the unicast destination.  This means that
userspace must specify NL80211_ATTR_SOCKET_OWNER flag if control port
over nl80211 routing is requested in NL80211_CMD_CONNECT,
NL80211_CMD_ASSOCIATE, NL80211_CMD_START_AP or IBSS/mesh join.

Signed-off-by: Denis Kenzior <denkenz@gmail.com>
[johannes: fix return value of cfg80211_rx_control_port()]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/wireless/trace.h   | 21 ++++++++++++++++++
 2 files changed, 79 insertions(+)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index cfaf2aeb9783..0870447fbd55 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -14553,6 +14553,64 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
 }
 EXPORT_SYMBOL(cfg80211_mgmt_tx_status);
 
+static int __nl80211_rx_control_port(struct net_device *dev,
+				     const u8 *buf, size_t len,
+				     const u8 *addr, u16 proto,
+				     bool unencrypted, gfp_t gfp)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+	struct sk_buff *msg;
+	void *hdr;
+	u32 nlportid = READ_ONCE(wdev->conn_owner_nlportid);
+
+	if (!nlportid)
+		return -ENOENT;
+
+	msg = nlmsg_new(100 + len, gfp);
+	if (!msg)
+		return -ENOMEM;
+
+	hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_CONTROL_PORT_FRAME);
+	if (!hdr) {
+		nlmsg_free(msg);
+		return -ENOBUFS;
+	}
+
+	if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+	    nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
+	    nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+			      NL80211_ATTR_PAD) ||
+	    nla_put(msg, NL80211_ATTR_FRAME, len, buf) ||
+	    nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) ||
+	    nla_put_u16(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, proto) ||
+	    (unencrypted && nla_put_flag(msg,
+					 NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT)))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+
+	return genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid);
+
+ nla_put_failure:
+	nlmsg_free(msg);
+	return -ENOBUFS;
+}
+
+bool cfg80211_rx_control_port(struct net_device *dev,
+			      const u8 *buf, size_t len,
+			      const u8 *addr, u16 proto, bool unencrypted)
+{
+	int ret;
+
+	trace_cfg80211_rx_control_port(dev, buf, len, addr, proto, unencrypted);
+	ret = __nl80211_rx_control_port(dev, buf, len, addr, proto,
+					unencrypted, GFP_ATOMIC);
+	trace_cfg80211_return_bool(ret == 0);
+	return ret == 0;
+}
+EXPORT_SYMBOL(cfg80211_rx_control_port);
+
 static struct sk_buff *cfg80211_prepare_cqm(struct net_device *dev,
 					    const char *mac, gfp_t gfp)
 {
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 5152938b358d..42fd338f879e 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2600,6 +2600,27 @@ TRACE_EVENT(cfg80211_mgmt_tx_status,
 		  WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack))
 );
 
+TRACE_EVENT(cfg80211_rx_control_port,
+	TP_PROTO(struct net_device *netdev, const u8 *buf, size_t len,
+		 const u8 *addr, u16 proto, bool unencrypted),
+	TP_ARGS(netdev, buf, len, addr, proto, unencrypted),
+	TP_STRUCT__entry(
+		NETDEV_ENTRY
+		MAC_ENTRY(addr)
+		__field(u16, proto)
+		__field(bool, unencrypted)
+	),
+	TP_fast_assign(
+		NETDEV_ASSIGN;
+		MAC_ASSIGN(addr, addr);
+		__entry->proto = proto;
+		__entry->unencrypted = unencrypted;
+	),
+	TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT " proto: 0x%x, unencrypted: %s",
+		  NETDEV_PR_ARG, MAC_PR_ARG(addr),
+		  __entry->proto, BOOL_TO_STR(__entry->unencrypted))
+);
+
 TRACE_EVENT(cfg80211_cqm_rssi_notify,
 	TP_PROTO(struct net_device *netdev,
 		 enum nl80211_cqm_rssi_threshold_event rssi_event,
-- 
cgit v1.2.3


From 2576a9ace47eba28a682d249d1d6402f891808c9 Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Mon, 26 Mar 2018 12:52:42 -0500
Subject: nl80211: Implement TX of control port frames

This commit implements the TX side of NL80211_CMD_CONTROL_PORT_FRAME.
Userspace provides the raw EAPoL frame using NL80211_ATTR_FRAME.
Userspace should also provide the destination address and the protocol
type to use when sending the frame.  This is used to implement TX of
Pre-authentication frames.  If CONTROL_PORT_ETHERTYPE_NO_ENCRYPT is
specified, then the driver will be asked not to encrypt the outgoing
frame.

A new EXT_FEATURE flag is introduced so that nl80211 code can check
whether a given wiphy has capability to pass EAPoL frames over nl80211.

Signed-off-by: Denis Kenzior <denkenz@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c  | 71 ++++++++++++++++++++++++++++++++++++++++++++++++-
 net/wireless/rdev-ops.h | 15 +++++++++++
 net/wireless/trace.h    | 26 ++++++++++++++++++
 3 files changed, 111 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 0870447fbd55..6eb286784924 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -12535,6 +12535,68 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info)
 	return rdev_external_auth(rdev, dev, &params);
 }
 
+static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info)
+{
+	struct cfg80211_registered_device *rdev = info->user_ptr[0];
+	struct net_device *dev = info->user_ptr[1];
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	const u8 *buf;
+	size_t len;
+	u8 *dest;
+	u16 proto;
+	bool noencrypt;
+	int err;
+
+	if (!wiphy_ext_feature_isset(&rdev->wiphy,
+				     NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211))
+		return -EOPNOTSUPP;
+
+	if (!rdev->ops->tx_control_port)
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[NL80211_ATTR_FRAME] ||
+	    !info->attrs[NL80211_ATTR_MAC] ||
+	    !info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) {
+		GENL_SET_ERR_MSG(info, "Frame, MAC or ethertype missing");
+		return -EINVAL;
+	}
+
+	wdev_lock(wdev);
+
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_P2P_GO:
+	case NL80211_IFTYPE_MESH_POINT:
+		break;
+	case NL80211_IFTYPE_ADHOC:
+	case NL80211_IFTYPE_STATION:
+	case NL80211_IFTYPE_P2P_CLIENT:
+		if (wdev->current_bss)
+			break;
+		err = -ENOTCONN;
+		goto out;
+	default:
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	wdev_unlock(wdev);
+
+	buf = nla_data(info->attrs[NL80211_ATTR_FRAME]);
+	len = nla_len(info->attrs[NL80211_ATTR_FRAME]);
+	dest = nla_data(info->attrs[NL80211_ATTR_MAC]);
+	proto = nla_get_u16(info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]);
+	noencrypt =
+		nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]);
+
+	return rdev_tx_control_port(rdev, dev, buf, len,
+				    dest, cpu_to_be16(proto), noencrypt);
+
+ out:
+	wdev_unlock(wdev);
+	return err;
+}
+
 #define NL80211_FLAG_NEED_WIPHY		0x01
 #define NL80211_FLAG_NEED_NETDEV	0x02
 #define NL80211_FLAG_NEED_RTNL		0x04
@@ -13438,7 +13500,14 @@ static const struct genl_ops nl80211_ops[] = {
 		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
 				  NL80211_FLAG_NEED_RTNL,
 	},
-
+	{
+		.cmd = NL80211_CMD_CONTROL_PORT_FRAME,
+		.doit = nl80211_tx_control_port,
+		.policy = nl80211_policy,
+		.flags = GENL_UNS_ADMIN_PERM,
+		.internal_flags = NL80211_FLAG_NEED_NETDEV_UP |
+				  NL80211_FLAG_NEED_RTNL,
+	},
 };
 
 static struct genl_family nl80211_fam __ro_after_init = {
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index 84f23ae015fc..87479a53411b 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -714,6 +714,21 @@ static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev,
 	return ret;
 }
 
+static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev,
+				       struct net_device *dev,
+				       const void *buf, size_t len,
+				       const u8 *dest, __be16 proto,
+				       const bool noencrypt)
+{
+	int ret;
+	trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len,
+				   dest, proto, noencrypt);
+	ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len,
+					 dest, proto, noencrypt);
+	trace_rdev_return_int(&rdev->wiphy, ret);
+	return ret;
+}
+
 static inline int
 rdev_mgmt_tx_cancel_wait(struct cfg80211_registered_device *rdev,
 			 struct wireless_dev *wdev, u64 cookie)
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 42fd338f879e..a64291ae52a6 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1882,6 +1882,32 @@ TRACE_EVENT(rdev_mgmt_tx,
 		  BOOL_TO_STR(__entry->dont_wait_for_ack))
 );
 
+TRACE_EVENT(rdev_tx_control_port,
+	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+		 const u8 *buf, size_t len, const u8 *dest, __be16 proto,
+		 bool unencrypted),
+	TP_ARGS(wiphy, netdev, buf, len, dest, proto, unencrypted),
+	TP_STRUCT__entry(
+		WIPHY_ENTRY
+		NETDEV_ENTRY
+		MAC_ENTRY(dest)
+		__field(__be16, proto)
+		__field(bool, unencrypted)
+	),
+	TP_fast_assign(
+		WIPHY_ASSIGN;
+		NETDEV_ASSIGN;
+		MAC_ASSIGN(dest, dest);
+		__entry->proto = proto;
+		__entry->unencrypted = unencrypted;
+	),
+	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " MAC_PR_FMT ","
+		  " proto: 0x%x, unencrypted: %s",
+		  WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(dest),
+		  be16_to_cpu(__entry->proto),
+		  BOOL_TO_STR(__entry->unencrypted))
+);
+
 TRACE_EVENT(rdev_set_noack_map,
 	TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
 		 u16 noack_map),
-- 
cgit v1.2.3


From 64bf3d4bc2b0725b3c5ffadd982a9746bfc738b7 Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Mon, 26 Mar 2018 12:52:43 -0500
Subject: nl80211: Add CONTROL_PORT_OVER_NL80211 attribute

Signed-off-by: Denis Kenzior <denkenz@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 6eb286784924..d3b14d9d002a 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -287,6 +287,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG },
 	[NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 },
 	[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG },
+	[NL80211_ATTR_CONTROL_PORT_OVER_NL80211] = { .type = NLA_FLAG },
 	[NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG },
 	[NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 },
 	[NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 },
@@ -8211,6 +8212,22 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
 	return err;
 }
 
+static int validate_pae_over_nl80211(struct cfg80211_registered_device *rdev,
+				     struct genl_info *info)
+{
+	if (!info->attrs[NL80211_ATTR_SOCKET_OWNER]) {
+		GENL_SET_ERR_MSG(info, "SOCKET_OWNER not set");
+		return -EINVAL;
+	}
+
+	if (!rdev->ops->tx_control_port ||
+	    !wiphy_ext_feature_isset(&rdev->wiphy,
+				     NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
 static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
 				   struct genl_info *info,
 				   struct cfg80211_crypto_settings *settings,
@@ -8234,6 +8251,15 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev,
 	} else
 		settings->control_port_ethertype = cpu_to_be16(ETH_P_PAE);
 
+	if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) {
+		int r = validate_pae_over_nl80211(rdev, info);
+
+		if (r < 0)
+			return r;
+
+		settings->control_port_over_nl80211 = true;
+	}
+
 	if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) {
 		void *data;
 		int len, i;
-- 
cgit v1.2.3


From c3bfe1f6fc98e7185ff5ee9279ba259fe484597c Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Mon, 26 Mar 2018 12:52:48 -0500
Subject: nl80211: Add control_port_over_nl80211 for ibss

Signed-off-by: Denis Kenzior <denkenz@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index d3b14d9d002a..f8e10408f2b3 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -8705,6 +8705,15 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
 	ibss.control_port =
 		nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT]);
 
+	if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) {
+		int r = validate_pae_over_nl80211(rdev, info);
+
+		if (r < 0)
+			return r;
+
+		ibss.control_port_over_nl80211 = true;
+	}
+
 	ibss.userspace_handles_dfs =
 		nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]);
 
-- 
cgit v1.2.3


From 1224f5831a22977f30c1842874be12c58608cee7 Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Mon, 26 Mar 2018 12:52:49 -0500
Subject: nl80211: Add control_port_over_nl80211 to mesh_setup

Signed-off-by: Denis Kenzior <denkenz@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/wireless/nl80211.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'net')

diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f8e10408f2b3..ff28f8feeb09 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -10168,6 +10168,15 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info)
 	setup.userspace_handles_dfs =
 		nla_get_flag(info->attrs[NL80211_ATTR_HANDLE_DFS]);
 
+	if (info->attrs[NL80211_ATTR_CONTROL_PORT_OVER_NL80211]) {
+		int r = validate_pae_over_nl80211(rdev, info);
+
+		if (r < 0)
+			return r;
+
+		setup.control_port_over_nl80211 = true;
+	}
+
 	wdev_lock(dev->ieee80211_ptr);
 	err = __cfg80211_join_mesh(rdev, dev, &setup, &cfg);
 	if (!err && info->attrs[NL80211_ATTR_SOCKET_OWNER])
-- 
cgit v1.2.3


From 911806491425d79107cadddbde11b42bbdfe38c8 Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Mon, 26 Mar 2018 12:52:50 -0500
Subject: mac80211: Add support for tx_control_port

Signed-off-by: Denis Kenzior <denkenz@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/cfg.c         |  1 +
 net/mac80211/ieee80211_i.h |  3 +++
 net/mac80211/tx.c          | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 36d128ebbac8..f6b8d59a7ee8 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3791,4 +3791,5 @@ const struct cfg80211_ops mac80211_config_ops = {
 	.add_nan_func = ieee80211_add_nan_func,
 	.del_nan_func = ieee80211_del_nan_func,
 	.set_multicast_to_unicast = ieee80211_set_multicast_to_unicast,
+	.tx_control_port = ieee80211_tx_control_port,
 };
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 5b3d78362fb5..275d6269b4a7 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1735,6 +1735,9 @@ void ieee80211_check_fast_xmit(struct sta_info *sta);
 void ieee80211_check_fast_xmit_all(struct ieee80211_local *local);
 void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata);
 void ieee80211_clear_fast_xmit(struct sta_info *sta);
+int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
+			      const u8 *buf, size_t len,
+			      const u8 *dest, __be16 proto, bool unencrypted);
 
 /* HT */
 void ieee80211_apply_htcap_overrides(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 933c67b5f845..535de3161a78 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -4757,3 +4757,49 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
 	ieee80211_xmit(sdata, NULL, skb);
 	local_bh_enable();
 }
+
+int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
+			      const u8 *buf, size_t len,
+			      const u8 *dest, __be16 proto, bool unencrypted)
+{
+	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+	struct ieee80211_local *local = sdata->local;
+	struct sk_buff *skb;
+	struct ethhdr *ehdr;
+	u32 flags;
+
+	/* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE
+	 * or Pre-Authentication
+	 */
+	if (proto != sdata->control_port_protocol &&
+	    proto != cpu_to_be16(ETH_P_PREAUTH))
+		return -EINVAL;
+
+	if (unencrypted)
+		flags = IEEE80211_TX_INTFL_DONT_ENCRYPT;
+	else
+		flags = 0;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom +
+			    sizeof(struct ethhdr) + len);
+	if (!skb)
+		return -ENOMEM;
+
+	skb_reserve(skb, local->hw.extra_tx_headroom + sizeof(struct ethhdr));
+
+	skb_put_data(skb, buf, len);
+
+	ehdr = skb_push(skb, sizeof(struct ethhdr));
+	memcpy(ehdr->h_dest, dest, ETH_ALEN);
+	memcpy(ehdr->h_source, sdata->vif.addr, ETH_ALEN);
+	ehdr->h_proto = proto;
+
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_802_3);
+	skb_reset_network_header(skb);
+	skb_reset_mac_header(skb);
+
+	__ieee80211_subif_start_xmit(skb, skb->dev, flags);
+
+	return 0;
+}
-- 
cgit v1.2.3


From 018f6fbf540d7bd7223b7d0b29651c1dd5e1c606 Mon Sep 17 00:00:00 2001
From: Denis Kenzior <denkenz@gmail.com>
Date: Mon, 26 Mar 2018 12:52:51 -0500
Subject: mac80211: Send control port frames over nl80211

If userspace requested control port frames to go over 80211, then do so.
The control packets are intercepted just prior to delivery of the packet
to the underlying network device.

Pre-authentication type frames (protocol: 0x88c7) are also forwarded
over nl80211.

Signed-off-by: Denis Kenzior <denkenz@gmail.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/cfg.c         |  6 ++++++
 net/mac80211/ibss.c        |  1 +
 net/mac80211/ieee80211_i.h |  1 +
 net/mac80211/iface.c       |  2 ++
 net/mac80211/main.c        |  2 ++
 net/mac80211/mlme.c        |  2 ++
 net/mac80211/rx.c          | 33 ++++++++++++++++++++++++++++-----
 7 files changed, 42 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index f6b8d59a7ee8..85dbaa891059 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -926,6 +926,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 	 */
 	sdata->control_port_protocol = params->crypto.control_port_ethertype;
 	sdata->control_port_no_encrypt = params->crypto.control_port_no_encrypt;
+	sdata->control_port_over_nl80211 =
+				params->crypto.control_port_over_nl80211;
 	sdata->encrypt_headroom = ieee80211_cs_headroom(sdata->local,
 							&params->crypto,
 							sdata->vif.type);
@@ -935,6 +937,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
 			params->crypto.control_port_ethertype;
 		vlan->control_port_no_encrypt =
 			params->crypto.control_port_no_encrypt;
+		vlan->control_port_over_nl80211 =
+			params->crypto.control_port_over_nl80211;
 		vlan->encrypt_headroom =
 			ieee80211_cs_headroom(sdata->local,
 					      &params->crypto,
@@ -2020,6 +2024,8 @@ static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev,
 	if (err)
 		return err;
 
+	sdata->control_port_over_nl80211 = setup->control_port_over_nl80211;
+
 	/* can mesh use other SMPS modes? */
 	sdata->smps_mode = IEEE80211_SMPS_OFF;
 	sdata->needed_rx_chains = sdata->local->rx_chains;
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index dc582aa35c89..6449a1c2283b 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -1844,6 +1844,7 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata,
 
 	sdata->smps_mode = IEEE80211_SMPS_OFF;
 	sdata->needed_rx_chains = local->rx_chains;
+	sdata->control_port_over_nl80211 = params->control_port_over_nl80211;
 
 	ieee80211_queue_work(&local->hw, &sdata->work);
 
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 275d6269b4a7..6372dbdadf53 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -900,6 +900,7 @@ struct ieee80211_sub_if_data {
 	u16 sequence_number;
 	__be16 control_port_protocol;
 	bool control_port_no_encrypt;
+	bool control_port_over_nl80211;
 	int encrypt_headroom;
 
 	atomic_t num_tx_queued;
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index d13ba064951f..555e389b7dfa 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -519,6 +519,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
 			master->control_port_protocol;
 		sdata->control_port_no_encrypt =
 			master->control_port_no_encrypt;
+		sdata->control_port_over_nl80211 =
+			master->control_port_over_nl80211;
 		sdata->vif.cab_queue = master->vif.cab_queue;
 		memcpy(sdata->vif.hw_queue, master->vif.hw_queue,
 		       sizeof(sdata->vif.hw_queue));
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 8d0333b5355b..9ea17afaa237 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -554,6 +554,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
 			   NL80211_FEATURE_USERSPACE_MPM |
 			   NL80211_FEATURE_FULL_AP_CLIENT_STATE;
 	wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_FILS_STA);
+	wiphy_ext_feature_set(wiphy,
+			      NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211);
 
 	if (!ops->hw_scan)
 		wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN |
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index d2bc52046729..20d2b186d740 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -4855,6 +4855,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
 
 	sdata->control_port_protocol = req->crypto.control_port_ethertype;
 	sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt;
+	sdata->control_port_over_nl80211 =
+					req->crypto.control_port_over_nl80211;
 	sdata->encrypt_headroom = ieee80211_cs_headroom(local, &req->crypto,
 							sdata->vif.type);
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 3a9f0c0a2de8..03102aff0953 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2245,6 +2245,32 @@ static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc)
 	return true;
 }
 
+static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb,
+						 struct ieee80211_rx_data *rx)
+{
+	struct ieee80211_sub_if_data *sdata = rx->sdata;
+	struct net_device *dev = sdata->dev;
+
+	if (unlikely((skb->protocol == sdata->control_port_protocol ||
+		      skb->protocol == cpu_to_be16(ETH_P_PREAUTH)) &&
+		     sdata->control_port_over_nl80211)) {
+		struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+		bool noencrypt = status->flag & RX_FLAG_DECRYPTED;
+		struct ethhdr *ehdr = eth_hdr(skb);
+
+		cfg80211_rx_control_port(dev, skb->data, skb->len,
+					 ehdr->h_source,
+					 be16_to_cpu(skb->protocol), noencrypt);
+		dev_kfree_skb(skb);
+	} else {
+		/* deliver to local stack */
+		if (rx->napi)
+			napi_gro_receive(rx->napi, skb);
+		else
+			netif_receive_skb(skb);
+	}
+}
+
 /*
  * requires that rx->skb is a frame with ethernet header
  */
@@ -2329,13 +2355,10 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
 #endif
 
 	if (skb) {
-		/* deliver to local stack */
 		skb->protocol = eth_type_trans(skb, dev);
 		memset(skb->cb, 0, sizeof(skb->cb));
-		if (rx->napi)
-			napi_gro_receive(rx->napi, skb);
-		else
-			netif_receive_skb(skb);
+
+		ieee80211_deliver_skb_to_local_stack(skb, rx);
 	}
 
 	if (xmit_skb) {
-- 
cgit v1.2.3


From c470bdc1aaf36669e04ba65faf1092b2d1c6cabe Mon Sep 17 00:00:00 2001
From: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Date: Mon, 26 Mar 2018 16:21:04 +0300
Subject: mac80211: don't WARN on bad WMM parameters from buggy APs

Apparently, some APs are buggy enough to send a zeroed
WMM IE. Don't WARN on this since this is not caused by a bug
on the client's system.

This aligns the condition of the WARNING in drv_conf_tx
with the validity check in ieee80211_sta_wmm_params.
We will now pick the default values whenever we get
a zeroed WMM IE.

This has been reported here:
https://bugzilla.kernel.org/show_bug.cgi?id=199161

Fixes: f409079bb678 ("mac80211: sanity check CW_min/CW_max towards driver")
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/mlme.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 20d2b186d740..07b58d20e89c 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1786,7 +1786,8 @@ static bool ieee80211_sta_wmm_params(struct ieee80211_local *local,
 		params[ac].acm = acm;
 		params[ac].uapsd = uapsd;
 
-		if (params[ac].cw_min > params[ac].cw_max) {
+		if (params->cw_min == 0 ||
+		    params[ac].cw_min > params[ac].cw_max) {
 			sdata_info(sdata,
 				   "AP has invalid WMM params (CWmin/max=%d/%d for ACI %d), using defaults\n",
 				   params[ac].cw_min, params[ac].cw_max, aci);
-- 
cgit v1.2.3


From f0b07bb151b098d291fd1fd71ef7a2df56fb124a Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 29 Mar 2018 19:20:32 +0300
Subject: net: Introduce net_rwsem to protect net_namespace_list

rtnl_lock() is used everywhere, and contention is very high.
When someone wants to iterate over alive net namespaces,
he/she has no a possibility to do that without exclusive lock.
But the exclusive rtnl_lock() in such places is overkill,
and it just increases the contention. Yes, there is already
for_each_net_rcu() in kernel, but it requires rcu_read_lock(),
and this can't be sleepable. Also, sometimes it may be need
really prevent net_namespace_list growth, so for_each_net_rcu()
is not fit there.

This patch introduces new rw_semaphore, which will be used
instead of rtnl_mutex to protect net_namespace_list. It is
sleepable and allows not-exclusive iterations over net
namespaces list. It allows to stop using rtnl_lock()
in several places (what is made in next patches) and makes
less the time, we keep rtnl_mutex. Here we just add new lock,
while the explanation of we can remove rtnl_lock() there are
in next patches.

Fine grained locks generally are better, then one big lock,
so let's do that with net_namespace_list, while the situation
allows that.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c                    |  5 +++++
 net/core/fib_notifier.c           |  2 ++
 net/core/net_namespace.c          | 18 +++++++++++++-----
 net/core/rtnetlink.c              |  5 +++++
 net/netfilter/nf_conntrack_core.c |  2 ++
 net/openvswitch/datapath.c        |  2 ++
 net/wireless/wext-core.c          |  2 ++
 7 files changed, 31 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index e13807b5c84d..eca5458b2753 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1629,6 +1629,7 @@ int register_netdevice_notifier(struct notifier_block *nb)
 		goto unlock;
 	if (dev_boot_phase)
 		goto unlock;
+	down_read(&net_rwsem);
 	for_each_net(net) {
 		for_each_netdev(net, dev) {
 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
@@ -1642,6 +1643,7 @@ int register_netdevice_notifier(struct notifier_block *nb)
 			call_netdevice_notifier(nb, NETDEV_UP, dev);
 		}
 	}
+	up_read(&net_rwsem);
 
 unlock:
 	rtnl_unlock();
@@ -1664,6 +1666,7 @@ rollback:
 	}
 
 outroll:
+	up_read(&net_rwsem);
 	raw_notifier_chain_unregister(&netdev_chain, nb);
 	goto unlock;
 }
@@ -1694,6 +1697,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
 	if (err)
 		goto unlock;
 
+	down_read(&net_rwsem);
 	for_each_net(net) {
 		for_each_netdev(net, dev) {
 			if (dev->flags & IFF_UP) {
@@ -1704,6 +1708,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 		}
 	}
+	up_read(&net_rwsem);
 unlock:
 	rtnl_unlock();
 	return err;
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
index 0c048bdeb016..614b985c92a4 100644
--- a/net/core/fib_notifier.c
+++ b/net/core/fib_notifier.c
@@ -33,6 +33,7 @@ static unsigned int fib_seq_sum(void)
 	struct net *net;
 
 	rtnl_lock();
+	down_read(&net_rwsem);
 	for_each_net(net) {
 		rcu_read_lock();
 		list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) {
@@ -43,6 +44,7 @@ static unsigned int fib_seq_sum(void)
 		}
 		rcu_read_unlock();
 	}
+	up_read(&net_rwsem);
 	rtnl_unlock();
 
 	return fib_seq;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b5796d17a302..7fdf321d4997 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -33,6 +33,10 @@ static struct list_head *first_device = &pernet_list;
 LIST_HEAD(net_namespace_list);
 EXPORT_SYMBOL_GPL(net_namespace_list);
 
+/* Protects net_namespace_list. Nests iside rtnl_lock() */
+DECLARE_RWSEM(net_rwsem);
+EXPORT_SYMBOL_GPL(net_rwsem);
+
 struct net init_net = {
 	.count		= REFCOUNT_INIT(1),
 	.dev_base_head	= LIST_HEAD_INIT(init_net.dev_base_head),
@@ -309,9 +313,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 		if (error < 0)
 			goto out_undo;
 	}
-	rtnl_lock();
+	down_write(&net_rwsem);
 	list_add_tail_rcu(&net->list, &net_namespace_list);
-	rtnl_unlock();
+	up_write(&net_rwsem);
 out:
 	return error;
 
@@ -450,7 +454,7 @@ static void unhash_nsid(struct net *net, struct net *last)
 	 * and this work is the only process, that may delete
 	 * a net from net_namespace_list. So, when the below
 	 * is executing, the list may only grow. Thus, we do not
-	 * use for_each_net_rcu() or rtnl_lock().
+	 * use for_each_net_rcu() or net_rwsem.
 	 */
 	for_each_net(tmp) {
 		int id;
@@ -485,7 +489,7 @@ static void cleanup_net(struct work_struct *work)
 	down_read(&pernet_ops_rwsem);
 
 	/* Don't let anyone else find us. */
-	rtnl_lock();
+	down_write(&net_rwsem);
 	llist_for_each_entry(net, net_kill_list, cleanup_list)
 		list_del_rcu(&net->list);
 	/* Cache last net. After we unlock rtnl, no one new net
@@ -499,7 +503,7 @@ static void cleanup_net(struct work_struct *work)
 	 * useless anyway, as netns_ids are destroyed there.
 	 */
 	last = list_last_entry(&net_namespace_list, struct net, list);
-	rtnl_unlock();
+	up_write(&net_rwsem);
 
 	llist_for_each_entry(net, net_kill_list, cleanup_list) {
 		unhash_nsid(net, last);
@@ -900,6 +904,9 @@ static int __register_pernet_operations(struct list_head *list,
 
 	list_add_tail(&ops->list, list);
 	if (ops->init || (ops->id && ops->size)) {
+		/* We held write locked pernet_ops_rwsem, and parallel
+		 * setup_net() and cleanup_net() are not possible.
+		 */
 		for_each_net(net) {
 			error = ops_init(ops, net);
 			if (error)
@@ -923,6 +930,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
 	LIST_HEAD(net_exit_list);
 
 	list_del(&ops->list);
+	/* See comment in __register_pernet_operations() */
 	for_each_net(net)
 		list_add_tail(&net->exit_list, &net_exit_list);
 	ops_exit_list(ops, &net_exit_list);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 2d3949789cef..e86b28482ca7 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -418,9 +418,11 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops)
 {
 	struct net *net;
 
+	down_read(&net_rwsem);
 	for_each_net(net) {
 		__rtnl_kill_links(net, ops);
 	}
+	up_read(&net_rwsem);
 	list_del(&ops->list);
 }
 EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
@@ -438,6 +440,9 @@ static void rtnl_lock_unregistering_all(void)
 	for (;;) {
 		unregistering = false;
 		rtnl_lock();
+		/* We held write locked pernet_ops_rwsem, and parallel
+		 * setup_net() and cleanup_net() are not possible.
+		 */
 		for_each_net(net) {
 			if (net->dev_unreg_count > 0) {
 				unregistering = true;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 705198de671d..370f9b7f051b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1764,12 +1764,14 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
 	struct net *net;
 
 	rtnl_lock();
+	down_read(&net_rwsem);
 	for_each_net(net) {
 		if (atomic_read(&net->ct.count) == 0)
 			continue;
 		__nf_ct_unconfirmed_destroy(net);
 		nf_queue_nf_hook_drop(net);
 	}
+	up_read(&net_rwsem);
 	rtnl_unlock();
 
 	/* Need to wait for netns cleanup worker to finish, if its
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index ef38e5aecd28..9746ee30a99b 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2364,8 +2364,10 @@ static void __net_exit ovs_exit_net(struct net *dnet)
 		__dp_destroy(dp);
 
 	rtnl_lock();
+	down_read(&net_rwsem);
 	for_each_net(net)
 		list_vports_from_net(net, dnet, &head);
+	up_read(&net_rwsem);
 	rtnl_unlock();
 
 	/* Detach all vports from given namespace. */
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 9efbfc753347..544d7b62d7ca 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -349,11 +349,13 @@ void wireless_nlevent_flush(void)
 
 	ASSERT_RTNL();
 
+	down_read(&net_rwsem);
 	for_each_net(net) {
 		while ((skb = skb_dequeue(&net->wext_nlevents)))
 			rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
 				    GFP_KERNEL);
 	}
+	up_read(&net_rwsem);
 }
 EXPORT_SYMBOL_GPL(wireless_nlevent_flush);
 
-- 
cgit v1.2.3


From 10256debb918aea083d0ddada64d29014c642a7b Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 29 Mar 2018 19:20:44 +0300
Subject: net: Don't take rtnl_lock() in wireless_nlevent_flush()

This function iterates over net_namespace_list and flushes
the queue for every of them. What does this rtnl_lock()
protects?! Since we may add skbs to net::wext_nlevents
without rtnl_lock(), it does not protects us about queuers.

It guarantees, two threads can't flush the queue in parallel,
that can change the order, but since skb can be queued
in any order, it doesn't matter, how many threads do this
in parallel. In case of several threads, this will be even
faster.

So, we can remove rtnl_lock() here, as it was used for
iteration over net_namespace_list only.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/wireless/wext-core.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 544d7b62d7ca..5e677dac2a0c 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -347,8 +347,6 @@ void wireless_nlevent_flush(void)
 	struct sk_buff *skb;
 	struct net *net;
 
-	ASSERT_RTNL();
-
 	down_read(&net_rwsem);
 	for_each_net(net) {
 		while ((skb = skb_dequeue(&net->wext_nlevents)))
@@ -412,9 +410,7 @@ subsys_initcall(wireless_nlevent_init);
 /* Process events generated by the wireless layer or the driver. */
 static void wireless_nlevent_process(struct work_struct *work)
 {
-	rtnl_lock();
 	wireless_nlevent_flush();
-	rtnl_unlock();
 }
 
 static DECLARE_WORK(wireless_nlevent_work, wireless_nlevent_process);
-- 
cgit v1.2.3


From ec9c780925c57588637e1dbd8650d294107311c0 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 29 Mar 2018 19:21:09 +0300
Subject: ovs: Remove rtnl_lock() from ovs_exit_net()

Here we iterate for_each_net() and removes
vport from alive net to the exiting net.

ovs_net::dps are protected by ovs_mutex(),
and the others, who change it (ovs_dp_cmd_new(),
__dp_destroy()) also take it.
The same with datapath::ports list.

So, we remove rtnl_lock() here.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/datapath.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 9746ee30a99b..015e24e08909 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2363,12 +2363,10 @@ static void __net_exit ovs_exit_net(struct net *dnet)
 	list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node)
 		__dp_destroy(dp);
 
-	rtnl_lock();
 	down_read(&net_rwsem);
 	for_each_net(net)
 		list_vports_from_net(net, dnet, &head);
 	up_read(&net_rwsem);
-	rtnl_unlock();
 
 	/* Detach all vports from given namespace. */
 	list_for_each_entry_safe(vport, vport_next, &head, detach_list) {
-- 
cgit v1.2.3


From 152f253152cc08f5d26ba76659723d2d83b363f8 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 29 Mar 2018 19:21:20 +0300
Subject: net: Remove rtnl_lock() in nf_ct_iterate_destroy()

rtnl_lock() doesn't protect net::ct::count,
and it's not needed for__nf_ct_unconfirmed_destroy()
and for nf_queue_nf_hook_drop().

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_core.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 370f9b7f051b..41ff04ee2554 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1763,7 +1763,6 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
 {
 	struct net *net;
 
-	rtnl_lock();
 	down_read(&net_rwsem);
 	for_each_net(net) {
 		if (atomic_read(&net->ct.count) == 0)
@@ -1772,7 +1771,6 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
 		nf_queue_nf_hook_drop(net);
 	}
 	up_read(&net_rwsem);
-	rtnl_unlock();
 
 	/* Need to wait for netns cleanup worker to finish, if its
 	 * running -- it might have deleted a net namespace from
-- 
cgit v1.2.3


From c30d9356e9e8ed0735c1215e187b03d3ae8b4966 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 27 Mar 2018 18:21:55 -0700
Subject: net: Fix fib notifer to return errno

Notifier handlers use notifier_from_errno to convert any potential error
to an encoded format. As a consequence the other side, call_fib_notifier{s}
in this case, needs to use notifier_to_errno to return the error from
the handler back to its caller.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_notifier.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
index 614b985c92a4..13a40b831d6d 100644
--- a/net/core/fib_notifier.c
+++ b/net/core/fib_notifier.c
@@ -13,16 +13,22 @@ int call_fib_notifier(struct notifier_block *nb, struct net *net,
 		      enum fib_event_type event_type,
 		      struct fib_notifier_info *info)
 {
+	int err;
+
 	info->net = net;
-	return nb->notifier_call(nb, event_type, info);
+	err = nb->notifier_call(nb, event_type, info);
+	return notifier_to_errno(err);
 }
 EXPORT_SYMBOL(call_fib_notifier);
 
 int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
 		       struct fib_notifier_info *info)
 {
+	int err;
+
 	info->net = net;
-	return atomic_notifier_call_chain(&fib_chain, event_type, info);
+	err = atomic_notifier_call_chain(&fib_chain, event_type, info);
+	return notifier_to_errno(err);
 }
 EXPORT_SYMBOL(call_fib_notifiers);
 
-- 
cgit v1.2.3


From 9776d32537d2bbc251fd1de651e2bb2439474bde Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 27 Mar 2018 18:21:56 -0700
Subject: net: Move call_fib_rule_notifiers up in fib_nl_newrule

Move call_fib_rule_notifiers up in fib_nl_newrule to the point right
before the rule is inserted into the list. At this point there are no
more failure paths within the core rule code, so if the notifier
does not fail then the rule will be inserted into the list.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/fib_rules.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 9d87ce868402..33958f84c173 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -631,6 +631,11 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err < 0)
 		goto errout_free;
 
+	err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops,
+				      extack);
+	if (err < 0)
+		goto errout_free;
+
 	list_for_each_entry(r, &ops->rules_list, list) {
 		if (r->pref > rule->pref)
 			break;
@@ -667,7 +672,6 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (rule->tun_id)
 		ip_tunnel_need_metadata();
 
-	call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops, extack);
 	notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
 	flush_route_cache(ops);
 	rules_ops_put(ops);
-- 
cgit v1.2.3


From 6635f311eab40b6d97eb884f371be41d0f5a3ed6 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 27 Mar 2018 18:21:57 -0700
Subject: net/ipv4: Move call_fib_entry_notifiers up for new routes

Move call to call_fib_entry_notifiers for new IPv4 routes to right
before the call to fib_insert_alias. At this point the only remaining
failure path is memory allocations in fib_insert_node. Handle that
very unlikely failure with a call to call_fib_entry_notifiers to
tell drivers about it.

At this point notifier handlers can decide the fate of the new route
with a clean path to delete the potential new entry if the notifier
returns non-0.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index fac0b73e24d1..67116233e2bc 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1065,6 +1065,9 @@ noleaf:
 	return -ENOMEM;
 }
 
+/* fib notifier for ADD is sent before calling fib_insert_alias with
+ * the expectation that the only possible failure ENOMEM
+ */
 static int fib_insert_alias(struct trie *t, struct key_vector *tp,
 			    struct key_vector *l, struct fib_alias *new,
 			    struct fib_alias *fa, t_key key)
@@ -1263,21 +1266,32 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 	new_fa->tb_id = tb->tb_id;
 	new_fa->fa_default = -1;
 
+	err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack);
+	if (err)
+		goto out_free_new_fa;
+
 	/* Insert new entry to the list. */
 	err = fib_insert_alias(t, tp, l, new_fa, fa, key);
 	if (err)
-		goto out_free_new_fa;
+		goto out_fib_notif;
 
 	if (!plen)
 		tb->tb_num_default++;
 
 	rt_cache_flush(cfg->fc_nlinfo.nl_net);
-	call_fib_entry_notifiers(net, event, key, plen, new_fa, extack);
 	rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id,
 		  &cfg->fc_nlinfo, nlflags);
 succeeded:
 	return 0;
 
+out_fib_notif:
+	/* notifier was sent that entry would be added to trie, but
+	 * the add failed and need to recover. Only failure for
+	 * fib_insert_alias is ENOMEM.
+	 */
+	NL_SET_ERR_MSG(extack, "Failed to insert route into trie");
+	call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key,
+				 plen, new_fa, NULL);
 out_free_new_fa:
 	kmem_cache_free(fn_alias_kmem, new_fa);
 out:
-- 
cgit v1.2.3


From c1d7ee67acb54b7dc1408929ff70dfe46993e517 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 27 Mar 2018 18:21:58 -0700
Subject: net/ipv4: Allow notifier to fail route replace

Add checking to call to call_fib_entry_notifiers for IPv4 route replace.
Allows a notifier handler to fail the replace.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 67116233e2bc..3dcffd3ce98c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1219,8 +1219,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
 			new_fa->tb_id = tb->tb_id;
 			new_fa->fa_default = -1;
 
-			call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
-						 key, plen, new_fa, extack);
+			err = call_fib_entry_notifiers(net,
+						       FIB_EVENT_ENTRY_REPLACE,
+						       key, plen, new_fa,
+						       extack);
+			if (err)
+				goto out_free_new_fa;
+
 			rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
 				  tb->tb_id, &cfg->fc_nlinfo, nlflags);
 
-- 
cgit v1.2.3


From 2233000cba40ee0784a2d5b5e2b2c38c1159a7ef Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 27 Mar 2018 18:21:59 -0700
Subject: net/ipv6: Move call_fib6_entry_notifiers up for route adds

Move call to call_fib6_entry_notifiers for new IPv6 routes to right
before the insertion into the FIB. At this point notifier handlers can
decide the fate of the new route with a clean path to delete the
potential new entry if the notifier returns non-0.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Reviewed-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 908b8e5b615a..deab2db6692e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1007,12 +1007,16 @@ add:
 		if (err)
 			return err;
 
+		err = call_fib6_entry_notifiers(info->nl_net,
+						FIB_EVENT_ENTRY_ADD,
+						rt, extack);
+		if (err)
+			return err;
+
 		rcu_assign_pointer(rt->rt6_next, iter);
 		atomic_inc(&rt->rt6i_ref);
 		rcu_assign_pointer(rt->rt6i_node, fn);
 		rcu_assign_pointer(*ins, rt);
-		call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD,
-					  rt, extack);
 		if (!info->skip_notify)
 			inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
 		info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
@@ -1036,12 +1040,16 @@ add:
 		if (err)
 			return err;
 
+		err = call_fib6_entry_notifiers(info->nl_net,
+						FIB_EVENT_ENTRY_REPLACE,
+						rt, extack);
+		if (err)
+			return err;
+
 		atomic_inc(&rt->rt6i_ref);
 		rcu_assign_pointer(rt->rt6i_node, fn);
 		rt->rt6_next = iter->rt6_next;
 		rcu_assign_pointer(*ins, rt);
-		call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE,
-					  rt, extack);
 		if (!info->skip_notify)
 			inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
 		if (!(fn->fn_flags & RTN_RTINFO)) {
-- 
cgit v1.2.3


From 18dcbe12fe9fca0ab825f7eff993060525ac2503 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 27 Mar 2018 19:49:42 -0700
Subject: ipv6: export ip6 fragments sysctl to unprivileged users

IPv4 was changed in commit 52a773d645e9 ("net: Export ip fragment
sysctl to unprivileged users")

The only sysctl that is not per-netns is not used :
ip6frag_secret_interval

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/reassembly.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index afbc000ad4f2..08a139f14d0f 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -650,10 +650,6 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
 		table[1].data = &net->ipv6.frags.low_thresh;
 		table[1].extra2 = &net->ipv6.frags.high_thresh;
 		table[2].data = &net->ipv6.frags.timeout;
-
-		/* Don't export sysctls to unprivileged users */
-		if (net->user_ns != &init_user_ns)
-			table[0].procname = NULL;
 	}
 
 	hdr = register_net_sysctl(net, "net/ipv6", table);
-- 
cgit v1.2.3


From 7ae665f132a62e67ccef1ef0994acba51abc2400 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 28 Mar 2018 16:14:56 +0200
Subject: sctp: fix unused lable warning

The proc file cleanup left a label possibly unused:

net/sctp/protocol.c: In function 'sctp_defaults_init':
net/sctp/protocol.c:1304:1: error: label 'err_init_proc' defined but not used [-Werror=unused-label]

This adds an #ifdef around it to match the respective 'goto'.

Fixes: d47d08c8ca05 ("sctp: use proc_remove_subtree()")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sctp/protocol.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 84a09f599131..a24cde236330 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1258,8 +1258,10 @@ static int __net_init sctp_defaults_init(struct net *net)
 
 	return 0;
 
+#ifdef CONFIG_PROC_FS
 err_init_proc:
 	cleanup_sctp_mibs(net);
+#endif
 err_init_mibs:
 	sctp_sysctl_net_unregister(net);
 err_sysctl_register:
-- 
cgit v1.2.3


From 8934ce2fd08171e8605f7fada91ee7619fe17ab8 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 28 Mar 2018 12:49:15 -0700
Subject: bpf: sockmap redirect ingress support

Add support for the BPF_F_INGRESS flag in sk_msg redirect helper.
To do this add a scatterlist ring for receiving socks to check
before calling into regular recvmsg call path. Additionally, because
the poll wakeup logic only checked the skb recv queue we need to
add a hook in TCP stack (similar to write side) so that we have
a way to wake up polling socks when a scatterlist is redirected
to that sock.

After this all that is needed is for the redirect helper to
push the scatterlist into the psock receive queue.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c |  2 +-
 net/ipv4/tcp.c    | 10 +++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index afd825534ac4..a5a995e5b380 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1894,7 +1894,7 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
 	   struct bpf_map *, map, u32, key, u64, flags)
 {
 	/* If user passes invalid input drop the packet. */
-	if (unlikely(flags))
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return SK_DROP;
 
 	msg->key = key;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0c31be306572..bccc4c270087 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -485,6 +485,14 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
 	}
 }
 
+static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
+					  int target, struct sock *sk)
+{
+	return (tp->rcv_nxt - tp->copied_seq >= target) ||
+		(sk->sk_prot->stream_memory_read ?
+		sk->sk_prot->stream_memory_read(sk) : false);
+}
+
 /*
  *	Wait for a TCP event.
  *
@@ -554,7 +562,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		    tp->urg_data)
 			target++;
 
-		if (tp->rcv_nxt - tp->copied_seq >= target)
+		if (tcp_stream_is_readable(tp, target, sk))
 			mask |= EPOLLIN | EPOLLRDNORM;
 
 		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
-- 
cgit v1.2.3


From fa246693a111fab32bd51d20f07a347e42773ee9 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 28 Mar 2018 12:49:25 -0700
Subject: bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT:

Add support for the BPF_F_INGRESS flag in skb redirect helper. To
do this convert skb into a scatterlist and push into ingress queue.
This is the same logic that is used in the sk_msg redirect helper
so it should feel familiar.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index a5a995e5b380..e989bf313195 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1855,7 +1855,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
 	/* If user passes invalid input drop the packet. */
-	if (unlikely(flags))
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return SK_DROP;
 
 	tcb->bpf.key = key;
-- 
cgit v1.2.3


From 39c202d228c3da5a5531be847e9b06cc9b787f31 Mon Sep 17 00:00:00 2001
From: Bernie Harris <bernie.harris@alliedtelesis.co.nz>
Date: Wed, 21 Mar 2018 15:42:15 +1300
Subject: netfilter: ebtables: Add support for specifying match revision

Currently ebtables assumes that the revision number of all match
modules is 0, which is an issue when trying to use existing
xtables matches with ebtables. The solution is to modify ebtables
to allow extensions to specify a revision number, similar to
iptables. This gets passed down to the kernel, which is then able
to find the match module correctly.

To main binary backwards compatibility, the size of the ebt_entry
structures is not changed, only the size of the name field is
decreased by 1 byte to make room for the revision field.

Signed-off-by: Bernie Harris <bernie.harris@alliedtelesis.co.nz>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebtables.c | 47 +++++++++++++++++++++++++----------------
 1 file changed, 29 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 9a26d2b7420f..a8cb543e3296 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -356,12 +356,12 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
 	    left - sizeof(struct ebt_entry_match) < m->match_size)
 		return -EINVAL;
 
-	match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0);
+	match = xt_find_match(NFPROTO_BRIDGE, m->u.name, m->u.revision);
 	if (IS_ERR(match) || match->family != NFPROTO_BRIDGE) {
 		if (!IS_ERR(match))
 			module_put(match->me);
 		request_module("ebt_%s", m->u.name);
-		match = xt_find_match(NFPROTO_BRIDGE, m->u.name, 0);
+		match = xt_find_match(NFPROTO_BRIDGE, m->u.name, m->u.revision);
 	}
 	if (IS_ERR(match))
 		return PTR_ERR(match);
@@ -1350,16 +1350,17 @@ static int update_counters(struct net *net, const void __user *user,
 
 static inline int ebt_obj_to_user(char __user *um, const char *_name,
 				  const char *data, int entrysize,
-				  int usersize, int datasize)
+				  int usersize, int datasize, u8 revision)
 {
-	char name[EBT_FUNCTION_MAXNAMELEN] = {0};
+	char name[EBT_EXTENSION_MAXNAMELEN] = {0};
 
-	/* ebtables expects 32 bytes long names but xt_match names are 29 bytes
+	/* ebtables expects 31 bytes long names but xt_match names are 29 bytes
 	 * long. Copy 29 bytes and fill remaining bytes with zeroes.
 	 */
 	strlcpy(name, _name, sizeof(name));
-	if (copy_to_user(um, name, EBT_FUNCTION_MAXNAMELEN) ||
-	    put_user(datasize, (int __user *)(um + EBT_FUNCTION_MAXNAMELEN)) ||
+	if (copy_to_user(um, name, EBT_EXTENSION_MAXNAMELEN) ||
+	    put_user(revision, (u8 __user *)(um + EBT_EXTENSION_MAXNAMELEN)) ||
+	    put_user(datasize, (int __user *)(um + EBT_EXTENSION_MAXNAMELEN + 1)) ||
 	    xt_data_to_user(um + entrysize, data, usersize, datasize,
 			    XT_ALIGN(datasize)))
 		return -EFAULT;
@@ -1372,7 +1373,8 @@ static inline int ebt_match_to_user(const struct ebt_entry_match *m,
 {
 	return ebt_obj_to_user(ubase + ((char *)m - base),
 			       m->u.match->name, m->data, sizeof(*m),
-			       m->u.match->usersize, m->match_size);
+			       m->u.match->usersize, m->match_size,
+			       m->u.match->revision);
 }
 
 static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w,
@@ -1380,7 +1382,8 @@ static inline int ebt_watcher_to_user(const struct ebt_entry_watcher *w,
 {
 	return ebt_obj_to_user(ubase + ((char *)w - base),
 			       w->u.watcher->name, w->data, sizeof(*w),
-			       w->u.watcher->usersize, w->watcher_size);
+			       w->u.watcher->usersize, w->watcher_size,
+			       w->u.watcher->revision);
 }
 
 static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base,
@@ -1411,7 +1414,8 @@ static inline int ebt_entry_to_user(struct ebt_entry *e, const char *base,
 	if (ret != 0)
 		return ret;
 	ret = ebt_obj_to_user(hlp, t->u.target->name, t->data, sizeof(*t),
-			      t->u.target->usersize, t->target_size);
+			      t->u.target->usersize, t->target_size,
+			      t->u.target->revision);
 	if (ret != 0)
 		return ret;
 
@@ -1599,7 +1603,10 @@ struct compat_ebt_replace {
 /* struct ebt_entry_match, _target and _watcher have same layout */
 struct compat_ebt_entry_mwt {
 	union {
-		char name[EBT_FUNCTION_MAXNAMELEN];
+		struct {
+			char name[EBT_EXTENSION_MAXNAMELEN];
+			u8 revision;
+		};
 		compat_uptr_t ptr;
 	} u;
 	compat_uint_t match_size;
@@ -1638,8 +1645,9 @@ static int compat_match_to_user(struct ebt_entry_match *m, void __user **dstptr,
 
 	BUG_ON(off >= m->match_size);
 
-	if (copy_to_user(cm->u.name, match->name,
-	    strlen(match->name) + 1) || put_user(msize, &cm->match_size))
+	if (copy_to_user(cm->u.name, match->name, strlen(match->name) + 1) ||
+	    put_user(match->revision, &cm->u.revision) ||
+	    put_user(msize, &cm->match_size))
 		return -EFAULT;
 
 	if (match->compat_to_user) {
@@ -1668,8 +1676,9 @@ static int compat_target_to_user(struct ebt_entry_target *t,
 
 	BUG_ON(off >= t->target_size);
 
-	if (copy_to_user(cm->u.name, target->name,
-	    strlen(target->name) + 1) || put_user(tsize, &cm->match_size))
+	if (copy_to_user(cm->u.name, target->name, strlen(target->name) + 1) ||
+	    put_user(target->revision, &cm->u.revision) ||
+	    put_user(tsize, &cm->match_size))
 		return -EFAULT;
 
 	if (target->compat_to_user) {
@@ -1933,7 +1942,7 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt,
 				struct ebt_entries_buf_state *state,
 				const unsigned char *base)
 {
-	char name[EBT_FUNCTION_MAXNAMELEN];
+	char name[EBT_EXTENSION_MAXNAMELEN];
 	struct xt_match *match;
 	struct xt_target *wt;
 	void *dst = NULL;
@@ -1947,7 +1956,8 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt,
 
 	switch (compat_mwt) {
 	case EBT_COMPAT_MATCH:
-		match = xt_request_find_match(NFPROTO_BRIDGE, name, 0);
+		match = xt_request_find_match(NFPROTO_BRIDGE, name,
+					      mwt->u.revision);
 		if (IS_ERR(match))
 			return PTR_ERR(match);
 
@@ -1966,7 +1976,8 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt,
 		break;
 	case EBT_COMPAT_WATCHER: /* fallthrough */
 	case EBT_COMPAT_TARGET:
-		wt = xt_request_find_target(NFPROTO_BRIDGE, name, 0);
+		wt = xt_request_find_target(NFPROTO_BRIDGE, name,
+					    mwt->u.revision);
 		if (IS_ERR(wt))
 			return PTR_ERR(wt);
 		off = xt_compat_target_offset(wt);
-- 
cgit v1.2.3


From 1be3ac98444066a292de02c2c12e203bdf575e7d Mon Sep 17 00:00:00 2001
From: Bernie Harris <bernie.harris@alliedtelesis.co.nz>
Date: Wed, 21 Mar 2018 15:42:16 +1300
Subject: netfilter: ebtables: Add string filter

This patch is part of a proposal to add a string filter to
ebtables, which would be similar to the string filter in
iptables. Like iptables, the ebtables filter uses the xt_string
module.

Signed-off-by: Bernie Harris <bernie.harris@alliedtelesis.co.nz>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_string.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c
index 423293ee57c2..be1feddadcf0 100644
--- a/net/netfilter/xt_string.c
+++ b/net/netfilter/xt_string.c
@@ -21,6 +21,7 @@ MODULE_DESCRIPTION("Xtables: string-based matching");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("ipt_string");
 MODULE_ALIAS("ip6t_string");
+MODULE_ALIAS("ebt_string");
 
 static bool
 string_mt(const struct sk_buff *skb, struct xt_action_param *par)
-- 
cgit v1.2.3


From 9124a20d8794663a396b5d6f91f66903848a042b Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Wed, 21 Mar 2018 04:03:22 -0700
Subject: netfilter: ebt_stp: Use generic functions for comparisons

Instead of unnecessary const declarations, use the generic functions to
save a little object space.

$ size net/bridge/netfilter/ebt_stp.o*
   text	   data	    bss	    dec	    hex	filename
   1250	    144	      0	   1394	    572	net/bridge/netfilter/ebt_stp.o.new
   1344	    144	      0	   1488	    5d0	net/bridge/netfilter/ebt_stp.o.old

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebt_stp.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 3140eb912d7e..47ba98db145d 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -153,8 +153,6 @@ ebt_stp_mt(const struct sk_buff *skb, struct xt_action_param *par)
 static int ebt_stp_mt_check(const struct xt_mtchk_param *par)
 {
 	const struct ebt_stp_info *info = par->matchinfo;
-	const u8 bridge_ula[6] = {0x01, 0x80, 0xc2, 0x00, 0x00, 0x00};
-	const u8 msk[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
 	const struct ebt_entry *e = par->entryinfo;
 
 	if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK ||
@@ -162,8 +160,8 @@ static int ebt_stp_mt_check(const struct xt_mtchk_param *par)
 		return -EINVAL;
 	/* Make sure the match only receives stp frames */
 	if (!par->nft_compat &&
-	    (!ether_addr_equal(e->destmac, bridge_ula) ||
-	     !ether_addr_equal(e->destmsk, msk) ||
+	    (!ether_addr_equal(e->destmac, eth_stp_addr) ||
+	     !is_broadcast_ether_addr(e->destmsk) ||
 	     !(e->bitmask & EBT_DESTMAC)))
 		return -EINVAL;
 
-- 
cgit v1.2.3


From 32537e91847a5686d57d3811c075a46b2d9b6434 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 27 Mar 2018 11:53:05 +0200
Subject: netfilter: nf_tables: rename struct nf_chain_type

Use nft_ prefix. By when I added chain types, I forgot to use the
nftables prefix. Rename enum nft_chain_type to enum nft_chain_types too,
otherwise there is an overlap.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/nf_tables_bridge.c   |  2 +-
 net/ipv4/netfilter/nf_tables_arp.c        |  2 +-
 net/ipv4/netfilter/nf_tables_ipv4.c       |  2 +-
 net/ipv4/netfilter/nft_chain_nat_ipv4.c   |  2 +-
 net/ipv4/netfilter/nft_chain_route_ipv4.c |  2 +-
 net/ipv6/netfilter/nf_tables_ipv6.c       |  2 +-
 net/ipv6/netfilter/nft_chain_nat_ipv6.c   |  2 +-
 net/ipv6/netfilter/nft_chain_route_ipv6.c |  2 +-
 net/netfilter/nf_tables_api.c             | 18 +++++++++---------
 net/netfilter/nf_tables_inet.c            |  2 +-
 net/netfilter/nf_tables_netdev.c          |  2 +-
 11 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 5160cf614176..73a1ec556a0a 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -42,7 +42,7 @@ nft_do_chain_bridge(void *priv,
 	return nft_do_chain(&pkt, priv);
 }
 
-static const struct nf_chain_type filter_bridge = {
+static const struct nft_chain_type filter_bridge = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
 	.family		= NFPROTO_BRIDGE,
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 036c074736b0..5b0be2a10b69 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -27,7 +27,7 @@ nft_do_chain_arp(void *priv,
 	return nft_do_chain(&pkt, priv);
 }
 
-static const struct nf_chain_type filter_arp = {
+static const struct nft_chain_type filter_arp = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
 	.family		= NFPROTO_ARP,
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 96f955496d5f..13bae5cfa257 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -30,7 +30,7 @@ static unsigned int nft_do_chain_ipv4(void *priv,
 	return nft_do_chain(&pkt, priv);
 }
 
-static const struct nf_chain_type filter_ipv4 = {
+static const struct nft_chain_type filter_ipv4 = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
 	.family		= NFPROTO_IPV4,
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index f2a490981594..167f377eb1cb 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -67,7 +67,7 @@ static unsigned int nft_nat_ipv4_local_fn(void *priv,
 	return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain);
 }
 
-static const struct nf_chain_type nft_chain_nat_ipv4 = {
+static const struct nft_chain_type nft_chain_nat_ipv4 = {
 	.name		= "nat",
 	.type		= NFT_CHAIN_T_NAT,
 	.family		= NFPROTO_IPV4,
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index d965c225b9f6..48cf1f892314 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -58,7 +58,7 @@ static unsigned int nf_route_table_hook(void *priv,
 	return ret;
 }
 
-static const struct nf_chain_type nft_chain_route_ipv4 = {
+static const struct nft_chain_type nft_chain_route_ipv4 = {
 	.name		= "route",
 	.type		= NFT_CHAIN_T_ROUTE,
 	.family		= NFPROTO_IPV4,
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 17e03589331c..d99f9ac6f1b6 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -28,7 +28,7 @@ static unsigned int nft_do_chain_ipv6(void *priv,
 	return nft_do_chain(&pkt, priv);
 }
 
-static const struct nf_chain_type filter_ipv6 = {
+static const struct nft_chain_type filter_ipv6 = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
 	.family		= NFPROTO_IPV6,
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
index 73fe2bd13fcf..c498aaa8056b 100644
--- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -65,7 +65,7 @@ static unsigned int nft_nat_ipv6_local_fn(void *priv,
 	return nf_nat_ipv6_local_fn(priv, skb, state, nft_nat_do_chain);
 }
 
-static const struct nf_chain_type nft_chain_nat_ipv6 = {
+static const struct nft_chain_type nft_chain_nat_ipv6 = {
 	.name		= "nat",
 	.type		= NFT_CHAIN_T_NAT,
 	.family		= NFPROTO_IPV6,
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index 11d3c3b9aa18..d5c7fdc34256 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -60,7 +60,7 @@ static unsigned int nf_route_table_hook(void *priv,
 	return ret;
 }
 
-static const struct nf_chain_type nft_chain_route_ipv6 = {
+static const struct nft_chain_type nft_chain_route_ipv6 = {
 	.name		= "route",
 	.type		= NFT_CHAIN_T_ROUTE,
 	.family		= NFPROTO_IPV6,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 92f5606b0dea..bf564f491085 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -384,9 +384,9 @@ static inline u64 nf_tables_alloc_handle(struct nft_table *table)
 	return ++table->hgenerator;
 }
 
-static const struct nf_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX];
+static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX];
 
-static const struct nf_chain_type *
+static const struct nft_chain_type *
 __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family)
 {
 	int i;
@@ -399,10 +399,10 @@ __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family)
 	return NULL;
 }
 
-static const struct nf_chain_type *
+static const struct nft_chain_type *
 nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload)
 {
-	const struct nf_chain_type *type;
+	const struct nft_chain_type *type;
 
 	type = __nf_tables_chain_type_lookup(nla, family);
 	if (type != NULL)
@@ -859,7 +859,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx)
 	kfree(ctx->table);
 }
 
-int nft_register_chain_type(const struct nf_chain_type *ctype)
+int nft_register_chain_type(const struct nft_chain_type *ctype)
 {
 	int err = 0;
 
@@ -878,7 +878,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(nft_register_chain_type);
 
-void nft_unregister_chain_type(const struct nf_chain_type *ctype)
+void nft_unregister_chain_type(const struct nft_chain_type *ctype)
 {
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
 	chain_type[ctype->family][ctype->type] = NULL;
@@ -1239,7 +1239,7 @@ static void nf_tables_chain_destroy(struct nft_chain *chain)
 struct nft_chain_hook {
 	u32				num;
 	s32				priority;
-	const struct nf_chain_type	*type;
+	const struct nft_chain_type	*type;
 	struct net_device		*dev;
 };
 
@@ -1249,7 +1249,7 @@ static int nft_chain_parse_hook(struct net *net,
 				bool create)
 {
 	struct nlattr *ha[NFTA_HOOK_MAX + 1];
-	const struct nf_chain_type *type;
+	const struct nft_chain_type *type;
 	struct net_device *dev;
 	int err;
 
@@ -6000,7 +6000,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
 };
 
 int nft_chain_validate_dependency(const struct nft_chain *chain,
-				  enum nft_chain_type type)
+				  enum nft_chain_types type)
 {
 	const struct nft_base_chain *basechain;
 
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index e30c7da09d0d..0aefe66ce558 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -38,7 +38,7 @@ static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
 	return nft_do_chain(&pkt, priv);
 }
 
-static const struct nf_chain_type filter_inet = {
+static const struct nft_chain_type filter_inet = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
 	.family		= NFPROTO_INET,
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 4041fafca934..88ea959211ac 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -38,7 +38,7 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb,
 	return nft_do_chain(&pkt, priv);
 }
 
-static const struct nf_chain_type nft_filter_chain_netdev = {
+static const struct nft_chain_type nft_filter_chain_netdev = {
 	.name		= "filter",
 	.type		= NFT_CHAIN_T_DEFAULT,
 	.family		= NFPROTO_NETDEV,
-- 
cgit v1.2.3


From cc07eeb0e5ee18895241460bdccf91a4952731f9 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 27 Mar 2018 11:53:06 +0200
Subject: netfilter: nf_tables: nft_register_chain_type() returns void

Use WARN_ON() instead since it should not happen that neither family
goes over NFPROTO_NUMPROTO nor there is already a chain of this type
already registered.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/nf_tables_bridge.c   |  4 +++-
 net/ipv4/netfilter/nf_tables_arp.c        |  4 +++-
 net/ipv4/netfilter/nf_tables_ipv4.c       |  4 +++-
 net/ipv4/netfilter/nft_chain_nat_ipv4.c   |  6 +-----
 net/ipv4/netfilter/nft_chain_route_ipv4.c |  4 +++-
 net/ipv6/netfilter/nf_tables_ipv6.c       |  4 +++-
 net/ipv6/netfilter/nft_chain_nat_ipv6.c   |  6 +-----
 net/ipv6/netfilter/nft_chain_route_ipv6.c |  4 +++-
 net/netfilter/nf_tables_api.c             | 14 +++++---------
 net/netfilter/nf_tables_inet.c            |  4 +++-
 net/netfilter/nf_tables_netdev.c          |  4 +---
 11 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'net')

diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 73a1ec556a0a..ffb8580dfdac 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -63,7 +63,9 @@ static const struct nft_chain_type filter_bridge = {
 
 static int __init nf_tables_bridge_init(void)
 {
-	return nft_register_chain_type(&filter_bridge);
+	nft_register_chain_type(&filter_bridge);
+
+	return 0;
 }
 
 static void __exit nf_tables_bridge_exit(void)
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 5b0be2a10b69..c2ee64208743 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -42,7 +42,9 @@ static const struct nft_chain_type filter_arp = {
 
 static int __init nf_tables_arp_init(void)
 {
-	return nft_register_chain_type(&filter_arp);
+	nft_register_chain_type(&filter_arp);
+
+	return 0;
 }
 
 static void __exit nf_tables_arp_exit(void)
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index 13bae5cfa257..c09667de0d68 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -51,7 +51,9 @@ static const struct nft_chain_type filter_ipv4 = {
 
 static int __init nf_tables_ipv4_init(void)
 {
-	return nft_register_chain_type(&filter_ipv4);
+	nft_register_chain_type(&filter_ipv4);
+
+	return 0;
 }
 
 static void __exit nf_tables_ipv4_exit(void)
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index 167f377eb1cb..9864f5b3279c 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -86,11 +86,7 @@ static const struct nft_chain_type nft_chain_nat_ipv4 = {
 
 static int __init nft_chain_nat_init(void)
 {
-	int err;
-
-	err = nft_register_chain_type(&nft_chain_nat_ipv4);
-	if (err < 0)
-		return err;
+	nft_register_chain_type(&nft_chain_nat_ipv4);
 
 	return 0;
 }
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 48cf1f892314..7d82934c46f4 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -71,7 +71,9 @@ static const struct nft_chain_type nft_chain_route_ipv4 = {
 
 static int __init nft_chain_route_init(void)
 {
-	return nft_register_chain_type(&nft_chain_route_ipv4);
+	nft_register_chain_type(&nft_chain_route_ipv4);
+
+	return 0;
 }
 
 static void __exit nft_chain_route_exit(void)
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index d99f9ac6f1b6..496f69453457 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -49,7 +49,9 @@ static const struct nft_chain_type filter_ipv6 = {
 
 static int __init nf_tables_ipv6_init(void)
 {
-	return nft_register_chain_type(&filter_ipv6);
+	nft_register_chain_type(&filter_ipv6);
+
+	return 0;
 }
 
 static void __exit nf_tables_ipv6_exit(void)
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
index c498aaa8056b..c95d9a97d425 100644
--- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -84,11 +84,7 @@ static const struct nft_chain_type nft_chain_nat_ipv6 = {
 
 static int __init nft_chain_nat_ipv6_init(void)
 {
-	int err;
-
-	err = nft_register_chain_type(&nft_chain_nat_ipv6);
-	if (err < 0)
-		return err;
+	nft_register_chain_type(&nft_chain_nat_ipv6);
 
 	return 0;
 }
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index d5c7fdc34256..da3f1f8cb325 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -73,7 +73,9 @@ static const struct nft_chain_type nft_chain_route_ipv6 = {
 
 static int __init nft_chain_route_init(void)
 {
-	return nft_register_chain_type(&nft_chain_route_ipv6);
+	nft_register_chain_type(&nft_chain_route_ipv6);
+
+	return 0;
 }
 
 static void __exit nft_chain_route_exit(void)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index bf564f491085..9e4b1614ee39 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -859,22 +859,18 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx)
 	kfree(ctx->table);
 }
 
-int nft_register_chain_type(const struct nft_chain_type *ctype)
+void nft_register_chain_type(const struct nft_chain_type *ctype)
 {
-	int err = 0;
-
 	if (WARN_ON(ctype->family >= NFPROTO_NUMPROTO))
-		return -EINVAL;
+		return;
 
 	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	if (chain_type[ctype->family][ctype->type] != NULL) {
-		err = -EBUSY;
-		goto out;
+	if (WARN_ON(chain_type[ctype->family][ctype->type] != NULL)) {
+		nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+		return;
 	}
 	chain_type[ctype->family][ctype->type] = ctype;
-out:
 	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-	return err;
 }
 EXPORT_SYMBOL_GPL(nft_register_chain_type);
 
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index 0aefe66ce558..202c4219969b 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -59,7 +59,9 @@ static const struct nft_chain_type filter_inet = {
 
 static int __init nf_tables_inet_init(void)
 {
-	return nft_register_chain_type(&filter_inet);
+	nft_register_chain_type(&filter_inet);
+
+	return 0;
 }
 
 static void __exit nf_tables_inet_exit(void)
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 88ea959211ac..4c3835bca63e 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -112,9 +112,7 @@ static int __init nf_tables_netdev_init(void)
 {
 	int ret;
 
-	ret = nft_register_chain_type(&nft_filter_chain_netdev);
-	if (ret)
-		return ret;
+	nft_register_chain_type(&nft_filter_chain_netdev);
 
 	ret = register_netdevice_notifier(&nf_tables_netdev_notifier);
 	if (ret)
-- 
cgit v1.2.3


From 02c7b25e5f54321b9063e18d4f52cce07f8e081d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 27 Mar 2018 11:53:07 +0200
Subject: netfilter: nf_tables: build-in filter chain type

One module per supported filter chain family type takes too much memory
for very little code - too much modularization - place all chain filter
definitions in one single file.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/Kconfig            |   2 +-
 net/bridge/netfilter/Makefile           |   1 -
 net/bridge/netfilter/nf_tables_bridge.c |  81 -------
 net/ipv4/netfilter/Kconfig              |   4 +-
 net/ipv4/netfilter/Makefile             |   2 -
 net/ipv4/netfilter/nf_tables_arp.c      |  60 -----
 net/ipv4/netfilter/nf_tables_ipv4.c     |  69 ------
 net/ipv6/netfilter/Kconfig              |   2 +-
 net/ipv6/netfilter/Makefile             |   1 -
 net/ipv6/netfilter/nf_tables_ipv6.c     |  67 ------
 net/netfilter/Kconfig                   |   4 +-
 net/netfilter/Makefile                  |   9 +-
 net/netfilter/nf_tables_api.c           |   3 +
 net/netfilter/nf_tables_inet.c          |  77 ------
 net/netfilter/nf_tables_netdev.c        | 140 -----------
 net/netfilter/nft_chain_filter.c        | 398 ++++++++++++++++++++++++++++++++
 16 files changed, 411 insertions(+), 509 deletions(-)
 delete mode 100644 net/bridge/netfilter/nf_tables_bridge.c
 delete mode 100644 net/ipv4/netfilter/nf_tables_arp.c
 delete mode 100644 net/ipv4/netfilter/nf_tables_ipv4.c
 delete mode 100644 net/ipv6/netfilter/nf_tables_ipv6.c
 delete mode 100644 net/netfilter/nf_tables_inet.c
 delete mode 100644 net/netfilter/nf_tables_netdev.c
 create mode 100644 net/netfilter/nft_chain_filter.c

(limited to 'net')

diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 225d1668dfdd..f212447794bd 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -5,7 +5,7 @@
 menuconfig NF_TABLES_BRIDGE
 	depends on BRIDGE && NETFILTER && NF_TABLES
 	select NETFILTER_FAMILY_BRIDGE
-	tristate "Ethernet Bridge nf_tables support"
+	bool "Ethernet Bridge nf_tables support"
 
 if NF_TABLES_BRIDGE
 
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 2f28e16de6c7..4bc758dd4a8c 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -3,7 +3,6 @@
 # Makefile for the netfilter modules for Link Layer filtering on a bridge.
 #
 
-obj-$(CONFIG_NF_TABLES_BRIDGE) += nf_tables_bridge.o
 obj-$(CONFIG_NFT_BRIDGE_META)  += nft_meta_bridge.o
 obj-$(CONFIG_NFT_BRIDGE_REJECT)  += nft_reject_bridge.o
 
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
deleted file mode 100644
index ffb8580dfdac..000000000000
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netfilter_bridge.h>
-#include <net/netfilter/nf_tables.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <net/netfilter/nf_tables_ipv4.h>
-#include <net/netfilter/nf_tables_ipv6.h>
-
-static unsigned int
-nft_do_chain_bridge(void *priv,
-		    struct sk_buff *skb,
-		    const struct nf_hook_state *state)
-{
-	struct nft_pktinfo pkt;
-
-	nft_set_pktinfo(&pkt, skb, state);
-
-	switch (eth_hdr(skb)->h_proto) {
-	case htons(ETH_P_IP):
-		nft_set_pktinfo_ipv4_validate(&pkt, skb);
-		break;
-	case htons(ETH_P_IPV6):
-		nft_set_pktinfo_ipv6_validate(&pkt, skb);
-		break;
-	default:
-		nft_set_pktinfo_unspec(&pkt, skb);
-		break;
-	}
-
-	return nft_do_chain(&pkt, priv);
-}
-
-static const struct nft_chain_type filter_bridge = {
-	.name		= "filter",
-	.type		= NFT_CHAIN_T_DEFAULT,
-	.family		= NFPROTO_BRIDGE,
-	.owner		= THIS_MODULE,
-	.hook_mask	= (1 << NF_BR_PRE_ROUTING) |
-			  (1 << NF_BR_LOCAL_IN) |
-			  (1 << NF_BR_FORWARD) |
-			  (1 << NF_BR_LOCAL_OUT) |
-			  (1 << NF_BR_POST_ROUTING),
-	.hooks		= {
-		[NF_BR_PRE_ROUTING]	= nft_do_chain_bridge,
-		[NF_BR_LOCAL_IN]	= nft_do_chain_bridge,
-		[NF_BR_FORWARD]		= nft_do_chain_bridge,
-		[NF_BR_LOCAL_OUT]	= nft_do_chain_bridge,
-		[NF_BR_POST_ROUTING]	= nft_do_chain_bridge,
-	},
-};
-
-static int __init nf_tables_bridge_init(void)
-{
-	nft_register_chain_type(&filter_bridge);
-
-	return 0;
-}
-
-static void __exit nf_tables_bridge_exit(void)
-{
-	nft_unregister_chain_type(&filter_bridge);
-}
-
-module_init(nf_tables_bridge_init);
-module_exit(nf_tables_bridge_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(AF_BRIDGE, "filter");
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index dfe6fa4ea554..280048e1e395 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -34,7 +34,7 @@ config NF_SOCKET_IPV4
 if NF_TABLES
 
 config NF_TABLES_IPV4
-	tristate "IPv4 nf_tables support"
+	bool "IPv4 nf_tables support"
 	help
 	  This option enables the IPv4 support for nf_tables.
 
@@ -71,7 +71,7 @@ config NFT_FIB_IPV4
 endif # NF_TABLES_IPV4
 
 config NF_TABLES_ARP
-	tristate "ARP nf_tables support"
+	bool "ARP nf_tables support"
 	select NETFILTER_FAMILY_ARP
 	help
 	  This option enables the ARP support for nf_tables.
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 2dad20eefd26..62ede5e3a3de 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -39,7 +39,6 @@ obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
 # NAT protocols (nf_nat)
 obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
 
-obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o
 obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
 obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
 obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
@@ -47,7 +46,6 @@ obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o
 obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
 obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
 obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
-obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
 
 # flow table support
 obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
deleted file mode 100644
index c2ee64208743..000000000000
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2008-2010 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2013 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/netfilter_arp.h>
-#include <net/netfilter/nf_tables.h>
-
-static unsigned int
-nft_do_chain_arp(void *priv,
-		  struct sk_buff *skb,
-		  const struct nf_hook_state *state)
-{
-	struct nft_pktinfo pkt;
-
-	nft_set_pktinfo(&pkt, skb, state);
-	nft_set_pktinfo_unspec(&pkt, skb);
-
-	return nft_do_chain(&pkt, priv);
-}
-
-static const struct nft_chain_type filter_arp = {
-	.name		= "filter",
-	.type		= NFT_CHAIN_T_DEFAULT,
-	.family		= NFPROTO_ARP,
-	.owner		= THIS_MODULE,
-	.hook_mask	= (1 << NF_ARP_IN) |
-			  (1 << NF_ARP_OUT),
-	.hooks		= {
-		[NF_ARP_IN]		= nft_do_chain_arp,
-		[NF_ARP_OUT]		= nft_do_chain_arp,
-	},
-};
-
-static int __init nf_tables_arp_init(void)
-{
-	nft_register_chain_type(&filter_arp);
-
-	return 0;
-}
-
-static void __exit nf_tables_arp_exit(void)
-{
-	nft_unregister_chain_type(&filter_arp);
-}
-
-module_init(nf_tables_arp_init);
-module_exit(nf_tables_arp_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(3, "filter"); /* NFPROTO_ARP */
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
deleted file mode 100644
index c09667de0d68..000000000000
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/ip.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/net_namespace.h>
-#include <net/ip.h>
-#include <net/netfilter/nf_tables_ipv4.h>
-
-static unsigned int nft_do_chain_ipv4(void *priv,
-				      struct sk_buff *skb,
-				      const struct nf_hook_state *state)
-{
-	struct nft_pktinfo pkt;
-
-	nft_set_pktinfo(&pkt, skb, state);
-	nft_set_pktinfo_ipv4(&pkt, skb);
-
-	return nft_do_chain(&pkt, priv);
-}
-
-static const struct nft_chain_type filter_ipv4 = {
-	.name		= "filter",
-	.type		= NFT_CHAIN_T_DEFAULT,
-	.family		= NFPROTO_IPV4,
-	.owner		= THIS_MODULE,
-	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
-			  (1 << NF_INET_LOCAL_OUT) |
-			  (1 << NF_INET_FORWARD) |
-			  (1 << NF_INET_PRE_ROUTING) |
-			  (1 << NF_INET_POST_ROUTING),
-	.hooks		= {
-		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv4,
-		[NF_INET_LOCAL_OUT]	= nft_do_chain_ipv4,
-		[NF_INET_FORWARD]	= nft_do_chain_ipv4,
-		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv4,
-		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv4,
-	},
-};
-
-static int __init nf_tables_ipv4_init(void)
-{
-	nft_register_chain_type(&filter_ipv4);
-
-	return 0;
-}
-
-static void __exit nf_tables_ipv4_exit(void)
-{
-	nft_unregister_chain_type(&filter_ipv4);
-}
-
-module_init(nf_tables_ipv4_init);
-module_exit(nf_tables_ipv4_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(AF_INET, "filter");
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index d395d1590699..ccbfa83e4bb0 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -34,7 +34,7 @@ config NF_SOCKET_IPV6
 if NF_TABLES
 
 config NF_TABLES_IPV6
-	tristate "IPv6 nf_tables support"
+	bool "IPv6 nf_tables support"
 	help
 	  This option enables the IPv6 support for nf_tables.
 
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index d984057b8395..44273d6f03a5 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -36,7 +36,6 @@ obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o
 obj-$(CONFIG_NF_DUP_IPV6) += nf_dup_ipv6.o
 
 # nf_tables
-obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o
 obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o
 obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o
 obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
deleted file mode 100644
index 496f69453457..000000000000
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/ipv6.h>
-#include <linux/netfilter_ipv6.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_ipv6.h>
-
-static unsigned int nft_do_chain_ipv6(void *priv,
-				      struct sk_buff *skb,
-				      const struct nf_hook_state *state)
-{
-	struct nft_pktinfo pkt;
-
-	nft_set_pktinfo(&pkt, skb, state);
-	nft_set_pktinfo_ipv6(&pkt, skb);
-
-	return nft_do_chain(&pkt, priv);
-}
-
-static const struct nft_chain_type filter_ipv6 = {
-	.name		= "filter",
-	.type		= NFT_CHAIN_T_DEFAULT,
-	.family		= NFPROTO_IPV6,
-	.owner		= THIS_MODULE,
-	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
-			  (1 << NF_INET_LOCAL_OUT) |
-			  (1 << NF_INET_FORWARD) |
-			  (1 << NF_INET_PRE_ROUTING) |
-			  (1 << NF_INET_POST_ROUTING),
-	.hooks		= {
-		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv6,
-		[NF_INET_LOCAL_OUT]	= nft_do_chain_ipv6,
-		[NF_INET_FORWARD]	= nft_do_chain_ipv6,
-		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv6,
-		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv6,
-	},
-};
-
-static int __init nf_tables_ipv6_init(void)
-{
-	nft_register_chain_type(&filter_ipv6);
-
-	return 0;
-}
-
-static void __exit nf_tables_ipv6_exit(void)
-{
-	nft_unregister_chain_type(&filter_ipv6);
-}
-
-module_init(nf_tables_ipv6_init);
-module_exit(nf_tables_ipv6_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(AF_INET6, "filter");
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index d3220b43c832..704b3832dbad 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -465,12 +465,12 @@ config NF_TABLES_INET
 	depends on IPV6
 	select NF_TABLES_IPV4
 	select NF_TABLES_IPV6
-	tristate "Netfilter nf_tables mixed IPv4/IPv6 tables support"
+	bool "Netfilter nf_tables mixed IPv4/IPv6 tables support"
 	help
 	  This option enables support for a mixed IPv4/IPv6 "inet" table.
 
 config NF_TABLES_NETDEV
-	tristate "Netfilter nf_tables netdev tables support"
+	bool "Netfilter nf_tables netdev tables support"
 	help
 	  This option enables support for the "netdev" table.
 
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 5d9b8b959e58..fd32bd2c9521 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -73,13 +73,12 @@ obj-$(CONFIG_NETFILTER_CONNCOUNT) += nf_conncount.o
 obj-$(CONFIG_NF_DUP_NETDEV)	+= nf_dup_netdev.o
 
 # nf_tables
-nf_tables-objs := nf_tables_core.o nf_tables_api.o nf_tables_trace.o \
-		  nft_immediate.o nft_cmp.o nft_range.o nft_bitwise.o \
-		  nft_byteorder.o nft_payload.o nft_lookup.o nft_dynset.o
+nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
+		  nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \
+		  nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
+		  nft_dynset.o
 
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
-obj-$(CONFIG_NF_TABLES_INET)	+= nf_tables_inet.o
-obj-$(CONFIG_NF_TABLES_NETDEV)	+= nf_tables_netdev.o
 obj-$(CONFIG_NFT_COMPAT)	+= nft_compat.o
 obj-$(CONFIG_NFT_EXTHDR)	+= nft_exthdr.o
 obj-$(CONFIG_NFT_META)		+= nft_meta.o
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9e4b1614ee39..97ec1c388bfe 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6584,6 +6584,8 @@ static int __init nf_tables_module_init(void)
 {
 	int err;
 
+	nft_chain_filter_init();
+
 	info = kmalloc(sizeof(struct nft_expr_info) * NFT_RULE_MAXEXPRS,
 		       GFP_KERNEL);
 	if (info == NULL) {
@@ -6618,6 +6620,7 @@ static void __exit nf_tables_module_exit(void)
 	rcu_barrier();
 	nf_tables_core_module_exit();
 	kfree(info);
+	nft_chain_filter_fini();
 }
 
 module_init(nf_tables_module_init);
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
deleted file mode 100644
index 202c4219969b..000000000000
--- a/net/netfilter/nf_tables_inet.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2012-2014 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv6.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_ipv4.h>
-#include <net/netfilter/nf_tables_ipv6.h>
-#include <net/ip.h>
-
-static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
-				      const struct nf_hook_state *state)
-{
-	struct nft_pktinfo pkt;
-
-	nft_set_pktinfo(&pkt, skb, state);
-
-	switch (state->pf) {
-	case NFPROTO_IPV4:
-		nft_set_pktinfo_ipv4(&pkt, skb);
-		break;
-	case NFPROTO_IPV6:
-		nft_set_pktinfo_ipv6(&pkt, skb);
-		break;
-	default:
-		break;
-	}
-
-	return nft_do_chain(&pkt, priv);
-}
-
-static const struct nft_chain_type filter_inet = {
-	.name		= "filter",
-	.type		= NFT_CHAIN_T_DEFAULT,
-	.family		= NFPROTO_INET,
-	.owner		= THIS_MODULE,
-	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
-			  (1 << NF_INET_LOCAL_OUT) |
-			  (1 << NF_INET_FORWARD) |
-			  (1 << NF_INET_PRE_ROUTING) |
-			  (1 << NF_INET_POST_ROUTING),
-	.hooks		= {
-		[NF_INET_LOCAL_IN]	= nft_do_chain_inet,
-		[NF_INET_LOCAL_OUT]	= nft_do_chain_inet,
-		[NF_INET_FORWARD]	= nft_do_chain_inet,
-		[NF_INET_PRE_ROUTING]	= nft_do_chain_inet,
-		[NF_INET_POST_ROUTING]	= nft_do_chain_inet,
-        },
-};
-
-static int __init nf_tables_inet_init(void)
-{
-	nft_register_chain_type(&filter_inet);
-
-	return 0;
-}
-
-static void __exit nf_tables_inet_exit(void)
-{
-	nft_unregister_chain_type(&filter_inet);
-}
-
-module_init(nf_tables_inet_init);
-module_exit(nf_tables_inet_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(1, "filter");
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
deleted file mode 100644
index 4c3835bca63e..000000000000
--- a/net/netfilter/nf_tables_netdev.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <net/netfilter/nf_tables.h>
-#include <linux/ip.h>
-#include <linux/ipv6.h>
-#include <net/netfilter/nf_tables_ipv4.h>
-#include <net/netfilter/nf_tables_ipv6.h>
-
-static unsigned int
-nft_do_chain_netdev(void *priv, struct sk_buff *skb,
-		    const struct nf_hook_state *state)
-{
-	struct nft_pktinfo pkt;
-
-	nft_set_pktinfo(&pkt, skb, state);
-
-	switch (skb->protocol) {
-	case htons(ETH_P_IP):
-		nft_set_pktinfo_ipv4_validate(&pkt, skb);
-		break;
-	case htons(ETH_P_IPV6):
-		nft_set_pktinfo_ipv6_validate(&pkt, skb);
-		break;
-	default:
-		nft_set_pktinfo_unspec(&pkt, skb);
-		break;
-	}
-
-	return nft_do_chain(&pkt, priv);
-}
-
-static const struct nft_chain_type nft_filter_chain_netdev = {
-	.name		= "filter",
-	.type		= NFT_CHAIN_T_DEFAULT,
-	.family		= NFPROTO_NETDEV,
-	.owner		= THIS_MODULE,
-	.hook_mask	= (1 << NF_NETDEV_INGRESS),
-	.hooks		= {
-		[NF_NETDEV_INGRESS]	= nft_do_chain_netdev,
-	},
-};
-
-static void nft_netdev_event(unsigned long event, struct net_device *dev,
-			     struct nft_ctx *ctx)
-{
-	struct nft_base_chain *basechain = nft_base_chain(ctx->chain);
-
-	switch (event) {
-	case NETDEV_UNREGISTER:
-		if (strcmp(basechain->dev_name, dev->name) != 0)
-			return;
-
-		__nft_release_basechain(ctx);
-		break;
-	case NETDEV_CHANGENAME:
-		if (dev->ifindex != basechain->ops.dev->ifindex)
-			return;
-
-		strncpy(basechain->dev_name, dev->name, IFNAMSIZ);
-		break;
-	}
-}
-
-static int nf_tables_netdev_event(struct notifier_block *this,
-				  unsigned long event, void *ptr)
-{
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-	struct nft_table *table;
-	struct nft_chain *chain, *nr;
-	struct nft_ctx ctx = {
-		.net	= dev_net(dev),
-	};
-
-	if (event != NETDEV_UNREGISTER &&
-	    event != NETDEV_CHANGENAME)
-		return NOTIFY_DONE;
-
-	nfnl_lock(NFNL_SUBSYS_NFTABLES);
-	list_for_each_entry(table, &ctx.net->nft.tables, list) {
-		if (table->family != NFPROTO_NETDEV)
-			continue;
-
-		ctx.family = table->family;
-		ctx.table = table;
-		list_for_each_entry_safe(chain, nr, &table->chains, list) {
-			if (!nft_is_base_chain(chain))
-				continue;
-
-			ctx.chain = chain;
-			nft_netdev_event(event, dev, &ctx);
-		}
-	}
-	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block nf_tables_netdev_notifier = {
-	.notifier_call	= nf_tables_netdev_event,
-};
-
-static int __init nf_tables_netdev_init(void)
-{
-	int ret;
-
-	nft_register_chain_type(&nft_filter_chain_netdev);
-
-	ret = register_netdevice_notifier(&nf_tables_netdev_notifier);
-	if (ret)
-		goto err_register_netdevice_notifier;
-
-	return 0;
-
-err_register_netdevice_notifier:
-	nft_unregister_chain_type(&nft_filter_chain_netdev);
-
-	return ret;
-}
-
-static void __exit nf_tables_netdev_exit(void)
-{
-	unregister_netdevice_notifier(&nf_tables_netdev_notifier);
-	nft_unregister_chain_type(&nft_filter_chain_netdev);
-}
-
-module_init(nf_tables_netdev_init);
-module_exit(nf_tables_netdev_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_ALIAS_NFT_CHAIN(5, "filter"); /* NFPROTO_NETDEV */
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
new file mode 100644
index 000000000000..84c902477a91
--- /dev/null
+++ b/net/netfilter/nft_chain_filter.c
@@ -0,0 +1,398 @@
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <net/netfilter/nf_tables.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_arp.h>
+#include <net/netfilter/nf_tables_ipv4.h>
+#include <net/netfilter/nf_tables_ipv6.h>
+
+#ifdef CONFIG_NF_TABLES_IPV4
+static unsigned int nft_do_chain_ipv4(void *priv,
+				      struct sk_buff *skb,
+				      const struct nf_hook_state *state)
+{
+	struct nft_pktinfo pkt;
+
+	nft_set_pktinfo(&pkt, skb, state);
+	nft_set_pktinfo_ipv4(&pkt, skb);
+
+	return nft_do_chain(&pkt, priv);
+}
+
+static const struct nft_chain_type nft_chain_filter_ipv4 = {
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.family		= NFPROTO_IPV4,
+	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_FORWARD) |
+			  (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING),
+	.hooks		= {
+		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv4,
+		[NF_INET_LOCAL_OUT]	= nft_do_chain_ipv4,
+		[NF_INET_FORWARD]	= nft_do_chain_ipv4,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv4,
+		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv4,
+	},
+};
+
+static void nft_chain_filter_ipv4_init(void)
+{
+	nft_register_chain_type(&nft_chain_filter_ipv4);
+}
+static void nft_chain_filter_ipv4_fini(void)
+{
+	nft_unregister_chain_type(&nft_chain_filter_ipv4);
+}
+
+#else
+static inline void nft_chain_filter_ipv4_init(void) {}
+static inline void nft_chain_filter_ipv4_fini(void) {}
+#endif /* CONFIG_NF_TABLES_IPV4 */
+
+#ifdef CONFIG_NF_TABLES_ARP
+static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb,
+				     const struct nf_hook_state *state)
+{
+	struct nft_pktinfo pkt;
+
+	nft_set_pktinfo(&pkt, skb, state);
+	nft_set_pktinfo_unspec(&pkt, skb);
+
+	return nft_do_chain(&pkt, priv);
+}
+
+static const struct nft_chain_type nft_chain_filter_arp = {
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.family		= NFPROTO_ARP,
+	.owner		= THIS_MODULE,
+	.hook_mask	= (1 << NF_ARP_IN) |
+			  (1 << NF_ARP_OUT),
+	.hooks		= {
+		[NF_ARP_IN]		= nft_do_chain_arp,
+		[NF_ARP_OUT]		= nft_do_chain_arp,
+	},
+};
+
+static void nft_chain_filter_arp_init(void)
+{
+	nft_register_chain_type(&nft_chain_filter_arp);
+}
+
+static void nft_chain_filter_arp_fini(void)
+{
+	nft_unregister_chain_type(&nft_chain_filter_arp);
+}
+#else
+static inline void nft_chain_filter_arp_init(void) {}
+static inline void nft_chain_filter_arp_fini(void) {}
+#endif /* CONFIG_NF_TABLES_ARP */
+
+#ifdef CONFIG_NF_TABLES_IPV6
+static unsigned int nft_do_chain_ipv6(void *priv,
+				      struct sk_buff *skb,
+				      const struct nf_hook_state *state)
+{
+	struct nft_pktinfo pkt;
+
+	nft_set_pktinfo(&pkt, skb, state);
+	nft_set_pktinfo_ipv6(&pkt, skb);
+
+	return nft_do_chain(&pkt, priv);
+}
+
+static const struct nft_chain_type nft_chain_filter_ipv6 = {
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.family		= NFPROTO_IPV6,
+	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_FORWARD) |
+			  (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING),
+	.hooks		= {
+		[NF_INET_LOCAL_IN]	= nft_do_chain_ipv6,
+		[NF_INET_LOCAL_OUT]	= nft_do_chain_ipv6,
+		[NF_INET_FORWARD]	= nft_do_chain_ipv6,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain_ipv6,
+		[NF_INET_POST_ROUTING]	= nft_do_chain_ipv6,
+	},
+};
+
+static void nft_chain_filter_ipv6_init(void)
+{
+	nft_register_chain_type(&nft_chain_filter_ipv6);
+}
+
+static void nft_chain_filter_ipv6_fini(void)
+{
+	nft_unregister_chain_type(&nft_chain_filter_ipv6);
+}
+#else
+static inline void nft_chain_filter_ipv6_init(void) {}
+static inline void nft_chain_filter_ipv6_fini(void) {}
+#endif /* CONFIG_NF_TABLES_IPV6 */
+
+#ifdef CONFIG_NF_TABLES_INET
+static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
+				      const struct nf_hook_state *state)
+{
+	struct nft_pktinfo pkt;
+
+	nft_set_pktinfo(&pkt, skb, state);
+
+	switch (state->pf) {
+	case NFPROTO_IPV4:
+		nft_set_pktinfo_ipv4(&pkt, skb);
+		break;
+	case NFPROTO_IPV6:
+		nft_set_pktinfo_ipv6(&pkt, skb);
+		break;
+	default:
+		break;
+	}
+
+	return nft_do_chain(&pkt, priv);
+}
+
+static const struct nft_chain_type nft_chain_filter_inet = {
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.family		= NFPROTO_INET,
+	.hook_mask	= (1 << NF_INET_LOCAL_IN) |
+			  (1 << NF_INET_LOCAL_OUT) |
+			  (1 << NF_INET_FORWARD) |
+			  (1 << NF_INET_PRE_ROUTING) |
+			  (1 << NF_INET_POST_ROUTING),
+	.hooks		= {
+		[NF_INET_LOCAL_IN]	= nft_do_chain_inet,
+		[NF_INET_LOCAL_OUT]	= nft_do_chain_inet,
+		[NF_INET_FORWARD]	= nft_do_chain_inet,
+		[NF_INET_PRE_ROUTING]	= nft_do_chain_inet,
+		[NF_INET_POST_ROUTING]	= nft_do_chain_inet,
+        },
+};
+
+static void nft_chain_filter_inet_init(void)
+{
+	nft_register_chain_type(&nft_chain_filter_inet);
+}
+
+static void nft_chain_filter_inet_fini(void)
+{
+	nft_unregister_chain_type(&nft_chain_filter_inet);
+}
+#else
+static inline void nft_chain_filter_inet_init(void) {}
+static inline void nft_chain_filter_inet_fini(void) {}
+#endif /* CONFIG_NF_TABLES_IPV6 */
+
+#ifdef CONFIG_NF_TABLES_BRIDGE
+static unsigned int
+nft_do_chain_bridge(void *priv,
+		    struct sk_buff *skb,
+		    const struct nf_hook_state *state)
+{
+	struct nft_pktinfo pkt;
+
+	nft_set_pktinfo(&pkt, skb, state);
+
+	switch (eth_hdr(skb)->h_proto) {
+	case htons(ETH_P_IP):
+		nft_set_pktinfo_ipv4_validate(&pkt, skb);
+		break;
+	case htons(ETH_P_IPV6):
+		nft_set_pktinfo_ipv6_validate(&pkt, skb);
+		break;
+	default:
+		nft_set_pktinfo_unspec(&pkt, skb);
+		break;
+	}
+
+	return nft_do_chain(&pkt, priv);
+}
+
+static const struct nft_chain_type nft_chain_filter_bridge = {
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.family		= NFPROTO_BRIDGE,
+	.hook_mask	= (1 << NF_BR_PRE_ROUTING) |
+			  (1 << NF_BR_LOCAL_IN) |
+			  (1 << NF_BR_FORWARD) |
+			  (1 << NF_BR_LOCAL_OUT) |
+			  (1 << NF_BR_POST_ROUTING),
+	.hooks		= {
+		[NF_BR_PRE_ROUTING]	= nft_do_chain_bridge,
+		[NF_BR_LOCAL_IN]	= nft_do_chain_bridge,
+		[NF_BR_FORWARD]		= nft_do_chain_bridge,
+		[NF_BR_LOCAL_OUT]	= nft_do_chain_bridge,
+		[NF_BR_POST_ROUTING]	= nft_do_chain_bridge,
+	},
+};
+
+static void nft_chain_filter_bridge_init(void)
+{
+	nft_register_chain_type(&nft_chain_filter_bridge);
+}
+
+static void nft_chain_filter_bridge_fini(void)
+{
+	nft_unregister_chain_type(&nft_chain_filter_bridge);
+}
+#else
+static inline void nft_chain_filter_bridge_init(void) {}
+static inline void nft_chain_filter_bridge_fini(void) {}
+#endif /* CONFIG_NF_TABLES_BRIDGE */
+
+#ifdef CONFIG_NF_TABLES_NETDEV
+static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb,
+					const struct nf_hook_state *state)
+{
+	struct nft_pktinfo pkt;
+
+	nft_set_pktinfo(&pkt, skb, state);
+
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		nft_set_pktinfo_ipv4_validate(&pkt, skb);
+		break;
+	case htons(ETH_P_IPV6):
+		nft_set_pktinfo_ipv6_validate(&pkt, skb);
+		break;
+	default:
+		nft_set_pktinfo_unspec(&pkt, skb);
+		break;
+	}
+
+	return nft_do_chain(&pkt, priv);
+}
+
+static const struct nft_chain_type nft_chain_filter_netdev = {
+	.name		= "filter",
+	.type		= NFT_CHAIN_T_DEFAULT,
+	.family		= NFPROTO_NETDEV,
+	.hook_mask	= (1 << NF_NETDEV_INGRESS),
+	.hooks		= {
+		[NF_NETDEV_INGRESS]	= nft_do_chain_netdev,
+	},
+};
+
+static void nft_netdev_event(unsigned long event, struct net_device *dev,
+			     struct nft_ctx *ctx)
+{
+	struct nft_base_chain *basechain = nft_base_chain(ctx->chain);
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		if (strcmp(basechain->dev_name, dev->name) != 0)
+			return;
+
+		__nft_release_basechain(ctx);
+		break;
+	case NETDEV_CHANGENAME:
+		if (dev->ifindex != basechain->ops.dev->ifindex)
+			return;
+
+		strncpy(basechain->dev_name, dev->name, IFNAMSIZ);
+		break;
+	}
+}
+
+static int nf_tables_netdev_event(struct notifier_block *this,
+				  unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct nft_table *table;
+	struct nft_chain *chain, *nr;
+	struct nft_ctx ctx = {
+		.net	= dev_net(dev),
+	};
+
+	if (event != NETDEV_UNREGISTER &&
+	    event != NETDEV_CHANGENAME)
+		return NOTIFY_DONE;
+
+	nfnl_lock(NFNL_SUBSYS_NFTABLES);
+	list_for_each_entry(table, &ctx.net->nft.tables, list) {
+		if (table->family != NFPROTO_NETDEV)
+			continue;
+
+		ctx.family = table->family;
+		ctx.table = table;
+		list_for_each_entry_safe(chain, nr, &table->chains, list) {
+			if (!nft_is_base_chain(chain))
+				continue;
+
+			ctx.chain = chain;
+			nft_netdev_event(event, dev, &ctx);
+		}
+	}
+	nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nf_tables_netdev_notifier = {
+	.notifier_call	= nf_tables_netdev_event,
+};
+
+static int nft_chain_filter_netdev_init(void)
+{
+	int err;
+
+	nft_register_chain_type(&nft_chain_filter_netdev);
+
+	err = register_netdevice_notifier(&nf_tables_netdev_notifier);
+	if (err)
+		goto err_register_netdevice_notifier;
+
+	return 0;
+
+err_register_netdevice_notifier:
+	nft_unregister_chain_type(&nft_chain_filter_netdev);
+
+	return err;
+}
+
+static void nft_chain_filter_netdev_fini(void)
+{
+	nft_unregister_chain_type(&nft_chain_filter_netdev);
+	unregister_netdevice_notifier(&nf_tables_netdev_notifier);
+}
+#else
+static inline int nft_chain_filter_netdev_init(void) { return 0; }
+static inline void nft_chain_filter_netdev_fini(void) {}
+#endif /* CONFIG_NF_TABLES_NETDEV */
+
+int __init nft_chain_filter_init(void)
+{
+	int err;
+
+	err = nft_chain_filter_netdev_init();
+	if (err < 0)
+		return err;
+
+	nft_chain_filter_ipv4_init();
+	nft_chain_filter_ipv6_init();
+	nft_chain_filter_arp_init();
+	nft_chain_filter_inet_init();
+	nft_chain_filter_bridge_init();
+
+	return 0;
+}
+
+void __exit nft_chain_filter_fini(void)
+{
+	nft_chain_filter_bridge_fini();
+	nft_chain_filter_inet_fini();
+	nft_chain_filter_arp_fini();
+	nft_chain_filter_ipv6_fini();
+	nft_chain_filter_ipv4_fini();
+	nft_chain_filter_netdev_fini();
+}
-- 
cgit v1.2.3


From 43a605f2f722b6e08addedae8545b490fca252c4 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 27 Mar 2018 11:53:08 +0200
Subject: netfilter: nf_tables: enable conntrack if NAT chain is registered

Register conntrack hooks if the user adds NAT chains. Users get confused
with the existing behaviour since they will see no packets hitting this
chain until they add the first rule that refers to conntrack.

This patch adds new ->init() and ->free() indirections to chain types
that can be used by NAT chains to invoke the conntrack dependency.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/nft_chain_nat_ipv4.c | 12 ++++++++++++
 net/ipv6/netfilter/nft_chain_nat_ipv6.c | 12 ++++++++++++
 net/netfilter/nf_tables_api.c           | 24 +++++++++++++++++-------
 3 files changed, 41 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
index 9864f5b3279c..b5464a3f253b 100644
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
@@ -67,6 +67,16 @@ static unsigned int nft_nat_ipv4_local_fn(void *priv,
 	return nf_nat_ipv4_local_fn(priv, skb, state, nft_nat_do_chain);
 }
 
+static int nft_nat_ipv4_init(struct nft_ctx *ctx)
+{
+	return nf_ct_netns_get(ctx->net, ctx->family);
+}
+
+static void nft_nat_ipv4_free(struct nft_ctx *ctx)
+{
+	nf_ct_netns_put(ctx->net, ctx->family);
+}
+
 static const struct nft_chain_type nft_chain_nat_ipv4 = {
 	.name		= "nat",
 	.type		= NFT_CHAIN_T_NAT,
@@ -82,6 +92,8 @@ static const struct nft_chain_type nft_chain_nat_ipv4 = {
 		[NF_INET_LOCAL_OUT]	= nft_nat_ipv4_local_fn,
 		[NF_INET_LOCAL_IN]	= nft_nat_ipv4_fn,
 	},
+	.init		= nft_nat_ipv4_init,
+	.free		= nft_nat_ipv4_free,
 };
 
 static int __init nft_chain_nat_init(void)
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
index c95d9a97d425..3557b114446c 100644
--- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
@@ -65,6 +65,16 @@ static unsigned int nft_nat_ipv6_local_fn(void *priv,
 	return nf_nat_ipv6_local_fn(priv, skb, state, nft_nat_do_chain);
 }
 
+static int nft_nat_ipv6_init(struct nft_ctx *ctx)
+{
+	return nf_ct_netns_get(ctx->net, ctx->family);
+}
+
+static void nft_nat_ipv6_free(struct nft_ctx *ctx)
+{
+	nf_ct_netns_put(ctx->net, ctx->family);
+}
+
 static const struct nft_chain_type nft_chain_nat_ipv6 = {
 	.name		= "nat",
 	.type		= NFT_CHAIN_T_NAT,
@@ -80,6 +90,8 @@ static const struct nft_chain_type nft_chain_nat_ipv6 = {
 		[NF_INET_LOCAL_OUT]	= nft_nat_ipv6_local_fn,
 		[NF_INET_LOCAL_IN]	= nft_nat_ipv6_fn,
 	},
+	.init		= nft_nat_ipv6_init,
+	.free		= nft_nat_ipv6_free,
 };
 
 static int __init nft_chain_nat_ipv6_init(void)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 97ec1c388bfe..af8b6a7488bd 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1211,13 +1211,17 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain,
 		rcu_assign_pointer(chain->stats, newstats);
 }
 
-static void nf_tables_chain_destroy(struct nft_chain *chain)
+static void nf_tables_chain_destroy(struct nft_ctx *ctx)
 {
+	struct nft_chain *chain = ctx->chain;
+
 	BUG_ON(chain->use > 0);
 
 	if (nft_is_base_chain(chain)) {
 		struct nft_base_chain *basechain = nft_base_chain(chain);
 
+		if (basechain->type->free)
+			basechain->type->free(ctx);
 		module_put(basechain->type->owner);
 		free_percpu(basechain->stats);
 		if (basechain->stats)
@@ -1354,6 +1358,9 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		}
 
 		basechain->type = hook.type;
+		if (basechain->type->init)
+			basechain->type->init(ctx);
+
 		chain = &basechain->chain;
 
 		ops		= &basechain->ops;
@@ -1374,6 +1381,8 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 		if (chain == NULL)
 			return -ENOMEM;
 	}
+	ctx->chain = chain;
+
 	INIT_LIST_HEAD(&chain->rules);
 	chain->handle = nf_tables_alloc_handle(table);
 	chain->table = table;
@@ -1387,7 +1396,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 	if (err < 0)
 		goto err1;
 
-	ctx->chain = chain;
 	err = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN);
 	if (err < 0)
 		goto err2;
@@ -1399,7 +1407,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
 err2:
 	nf_tables_unregister_hook(net, table, chain);
 err1:
-	nf_tables_chain_destroy(chain);
+	nf_tables_chain_destroy(ctx);
 
 	return err;
 }
@@ -5678,7 +5686,7 @@ static void nf_tables_commit_release(struct nft_trans *trans)
 		nf_tables_table_destroy(&trans->ctx);
 		break;
 	case NFT_MSG_DELCHAIN:
-		nf_tables_chain_destroy(trans->ctx.chain);
+		nf_tables_chain_destroy(&trans->ctx);
 		break;
 	case NFT_MSG_DELRULE:
 		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
@@ -5849,7 +5857,7 @@ static void nf_tables_abort_release(struct nft_trans *trans)
 		nf_tables_table_destroy(&trans->ctx);
 		break;
 	case NFT_MSG_NEWCHAIN:
-		nf_tables_chain_destroy(trans->ctx.chain);
+		nf_tables_chain_destroy(&trans->ctx);
 		break;
 	case NFT_MSG_NEWRULE:
 		nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
@@ -6499,7 +6507,7 @@ int __nft_release_basechain(struct nft_ctx *ctx)
 	}
 	list_del(&ctx->chain->list);
 	ctx->table->use--;
-	nf_tables_chain_destroy(ctx->chain);
+	nf_tables_chain_destroy(ctx);
 
 	return 0;
 }
@@ -6515,6 +6523,7 @@ static void __nft_release_tables(struct net *net)
 	struct nft_set *set, *ns;
 	struct nft_ctx ctx = {
 		.net	= net,
+		.family	= NFPROTO_NETDEV,
 	};
 
 	list_for_each_entry_safe(table, nt, &net->nft.tables, list) {
@@ -6551,9 +6560,10 @@ static void __nft_release_tables(struct net *net)
 			nft_obj_destroy(obj);
 		}
 		list_for_each_entry_safe(chain, nc, &table->chains, list) {
+			ctx.chain = chain;
 			list_del(&chain->list);
 			table->use--;
-			nf_tables_chain_destroy(chain);
+			nf_tables_chain_destroy(&ctx);
 		}
 		list_del(&table->list);
 		nf_tables_table_destroy(&ctx);
-- 
cgit v1.2.3


From 10659cbab72b7bfee1a886018d1915a9549b6378 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 28 Mar 2018 12:06:49 +0200
Subject: netfilter: nf_tables: rename to nft_set_lookup_global()

To prepare shorter introduction of shorter function prefix.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 12 ++++++------
 net/netfilter/nft_dynset.c    |  5 +++--
 net/netfilter/nft_lookup.c    |  4 ++--
 net/netfilter/nft_objref.c    |  5 +++--
 4 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index af8b6a7488bd..769d84015073 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2633,11 +2633,11 @@ static struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
 	return ERR_PTR(-ENOENT);
 }
 
-struct nft_set *nft_set_lookup(const struct net *net,
-			       const struct nft_table *table,
-			       const struct nlattr *nla_set_name,
-			       const struct nlattr *nla_set_id,
-			       u8 genmask)
+struct nft_set *nft_set_lookup_global(const struct net *net,
+				      const struct nft_table *table,
+				      const struct nlattr *nla_set_name,
+				      const struct nlattr *nla_set_id,
+				      u8 genmask)
 {
 	struct nft_set *set;
 
@@ -2650,7 +2650,7 @@ struct nft_set *nft_set_lookup(const struct net *net,
 	}
 	return set;
 }
-EXPORT_SYMBOL_GPL(nft_set_lookup);
+EXPORT_SYMBOL_GPL(nft_set_lookup_global);
 
 static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
 				    const char *name)
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index fc83e29d6634..04863fad05dd 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -132,8 +132,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 			priv->invert = true;
 	}
 
-	set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_DYNSET_SET_NAME],
-			     tb[NFTA_DYNSET_SET_ID], genmask);
+	set = nft_set_lookup_global(ctx->net, ctx->table,
+				    tb[NFTA_DYNSET_SET_NAME],
+				    tb[NFTA_DYNSET_SET_ID], genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 475570e89ede..f52da5e2199f 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -71,8 +71,8 @@ static int nft_lookup_init(const struct nft_ctx *ctx,
 	    tb[NFTA_LOOKUP_SREG] == NULL)
 		return -EINVAL;
 
-	set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_LOOKUP_SET],
-			     tb[NFTA_LOOKUP_SET_ID], genmask);
+	set = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_LOOKUP_SET],
+				    tb[NFTA_LOOKUP_SET_ID], genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 7bcdc48f3d73..0b02407773ad 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -117,8 +117,9 @@ static int nft_objref_map_init(const struct nft_ctx *ctx,
 	struct nft_set *set;
 	int err;
 
-	set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_OBJREF_SET_NAME],
-			     tb[NFTA_OBJREF_SET_ID], genmask);
+	set = nft_set_lookup_global(ctx->net, ctx->table,
+				    tb[NFTA_OBJREF_SET_NAME],
+				    tb[NFTA_OBJREF_SET_ID], genmask);
 	if (IS_ERR(set))
 		return PTR_ERR(set);
 
-- 
cgit v1.2.3


From a3073c17dd8cd041d5cf68f28a80a54e310f2f45 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 28 Mar 2018 12:06:50 +0200
Subject: netfilter: nf_tables: use nft_set_lookup_global from
 nf_tables_newsetelem()

Replace opencoded implementation of nft_set_lookup_global() by call to
this function.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 769d84015073..2bd80fa9b070 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4032,17 +4032,10 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
 	if (err < 0)
 		return err;
 
-	set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
-				   genmask);
-	if (IS_ERR(set)) {
-		if (nla[NFTA_SET_ELEM_LIST_SET_ID]) {
-			set = nf_tables_set_lookup_byid(net,
-					nla[NFTA_SET_ELEM_LIST_SET_ID],
-					genmask);
-		}
-		if (IS_ERR(set))
-			return PTR_ERR(set);
-	}
+	set = nft_set_lookup_global(net, ctx.table, nla[NFTA_SET_ELEM_LIST_SET],
+				    nla[NFTA_SET_ELEM_LIST_SET_ID], genmask);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
 
 	if (!list_empty(&set->bindings) && set->flags & NFT_SET_CONSTANT)
 		return -EBUSY;
-- 
cgit v1.2.3


From c47d36b3855d804b2e282f9b4eecbbd19b5453f9 Mon Sep 17 00:00:00 2001
From: Arushi Singhal <arushisinghal19971997@gmail.com>
Date: Thu, 29 Mar 2018 00:39:50 +0530
Subject: netfilter: Merge assignment with return

Merge assignment with return statement to directly return the value.

Signed-off-by: Arushi Singhal <arushisinghal19971997@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c | 5 ++---
 net/netfilter/xt_hashlimit.c         | 3 +--
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 11ef85a57244..b00e84bf4107 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1527,9 +1527,8 @@ ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])
 	if (ret < 0)
 		return ret;
 
-	ret = ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC,
-					cda[CTA_NAT_SRC]);
-	return ret;
+	return ctnetlink_parse_nat_setup(ct, NF_NAT_MANIP_SRC,
+					 cda[CTA_NAT_SRC]);
 #else
 	if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC])
 		return 0;
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index db2fe0911740..64fc3721d74c 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -534,8 +534,7 @@ static u64 user2rate_bytes(u32 user)
 	u64 r;
 
 	r = user ? U32_MAX / user : U32_MAX;
-	r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT;
-	return r;
+	return (r - 1) << XT_HASHLIMIT_BYTE_SHIFT;
 }
 
 static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now,
-- 
cgit v1.2.3


From 9ba5c404bf1d6284f0269411b33394362b7ff405 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben.hutchings@codethink.co.uk>
Date: Thu, 29 Mar 2018 15:12:41 +0100
Subject: netfilter: x_tables: Add note about how to free percpu counters

Due to the way percpu counters are allocated and freed in blocks,
it is not safe to free counters individually.  Currently all callers
do the right thing, but let's note this restriction.

Fixes: ae0ac0ed6fcf ("netfilter: x_tables: pack percpu counter allocations")
Signed-off-by: Ben Hutchings <ben.hutchings@codethink.co.uk>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index bac932f1c582..75cd5196b29b 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1854,7 +1854,9 @@ EXPORT_SYMBOL_GPL(xt_proto_fini);
  * to fetch the real percpu counter.
  *
  * To speed up allocation and improve data locality, a 4kb block is
- * allocated.
+ * allocated.  Freeing any counter may free an entire block, so all
+ * counters allocated using the same state must be freed at the same
+ * time.
  *
  * xt_percpu_counter_alloc_state contains the base address of the
  * allocated page and the current sub-offset.
-- 
cgit v1.2.3


From e3b5e1ec75234fb6b27708a316cdf69f9fb176a8 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Fri, 30 Mar 2018 11:39:12 +0200
Subject: Revert "netfilter: x_tables: ensure last rule in base chain matches
 underflow/policy"

This reverts commit 0d7df906a0e78079a02108b06d32c3ef2238ad25.

Valdis Kletnieks reported that xtables is broken in linux-next since
0d7df906a0e78  ("netfilter: x_tables: ensure last rule in base chain
matches underflow/policy"), as kernel rejects the (well-formed) ruleset:

[   64.402790] ip6_tables: last base chain position 1136 doesn't match underflow 1344 (hook 1)

mark_source_chains is not the correct place for such a check, as it
terminates evaluation of a chain once it sees an unconditional verdict
(following rules are known to be unreachable). It seems preferrable to
fix libiptc instead, so remove this check again.

Fixes: 0d7df906a0e78 ("netfilter: x_tables: ensure last rule in base chain matches underflow/policy")
Reported-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv4/netfilter/arp_tables.c | 17 +----------------
 net/ipv4/netfilter/ip_tables.c  | 17 +----------------
 net/ipv6/netfilter/ip6_tables.c | 17 +----------------
 3 files changed, 3 insertions(+), 48 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index f366ff1cfc19..aaafdbd15ad3 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -309,13 +309,10 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 	for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
 		unsigned int pos = newinfo->hook_entry[hook];
 		struct arpt_entry *e = entry0 + pos;
-		unsigned int last_pos, depth;
 
 		if (!(valid_hooks & (1 << hook)))
 			continue;
 
-		depth = 0;
-		last_pos = pos;
 		/* Set initial back pointer. */
 		e->counters.pcnt = pos;
 
@@ -346,8 +343,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 					pos = e->counters.pcnt;
 					e->counters.pcnt = 0;
 
-					if (depth)
-						--depth;
 					/* We're at the start. */
 					if (pos == oldpos)
 						goto next;
@@ -372,9 +367,6 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 					if (!xt_find_jump_offset(offsets, newpos,
 								 newinfo->number))
 						return 0;
-
-					if (entry0 + newpos != arpt_next_entry(e))
-						++depth;
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
@@ -385,15 +377,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 				e->counters.pcnt = pos;
 				pos = newpos;
 			}
-			if (depth == 0)
-				last_pos = pos;
-		}
-next:
-		if (last_pos != newinfo->underflow[hook]) {
-			pr_err_ratelimited("last base chain position %u doesn't match underflow %u (hook %u)\n",
-					   last_pos, newinfo->underflow[hook], hook);
-			return 0;
 		}
+next:		;
 	}
 	return 1;
 }
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 2362ca2c9e0c..f9063513f9d1 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -378,13 +378,10 @@ mark_source_chains(const struct xt_table_info *newinfo,
 	for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
 		unsigned int pos = newinfo->hook_entry[hook];
 		struct ipt_entry *e = entry0 + pos;
-		unsigned int last_pos, depth;
 
 		if (!(valid_hooks & (1 << hook)))
 			continue;
 
-		depth = 0;
-		last_pos = pos;
 		/* Set initial back pointer. */
 		e->counters.pcnt = pos;
 
@@ -413,8 +410,6 @@ mark_source_chains(const struct xt_table_info *newinfo,
 					pos = e->counters.pcnt;
 					e->counters.pcnt = 0;
 
-					if (depth)
-						--depth;
 					/* We're at the start. */
 					if (pos == oldpos)
 						goto next;
@@ -439,9 +434,6 @@ mark_source_chains(const struct xt_table_info *newinfo,
 					if (!xt_find_jump_offset(offsets, newpos,
 								 newinfo->number))
 						return 0;
-
-					if (entry0 + newpos != ipt_next_entry(e))
-						++depth;
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
@@ -452,15 +444,8 @@ mark_source_chains(const struct xt_table_info *newinfo,
 				e->counters.pcnt = pos;
 				pos = newpos;
 			}
-			if (depth == 0)
-				last_pos = pos;
-		}
-next:
-		if (last_pos != newinfo->underflow[hook]) {
-			pr_err_ratelimited("last base chain position %u doesn't match underflow %u (hook %u)\n",
-					   last_pos, newinfo->underflow[hook], hook);
-			return 0;
 		}
+next:		;
 	}
 	return 1;
 }
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 004508753abc..3c36a4c77f29 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -396,13 +396,10 @@ mark_source_chains(const struct xt_table_info *newinfo,
 	for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) {
 		unsigned int pos = newinfo->hook_entry[hook];
 		struct ip6t_entry *e = entry0 + pos;
-		unsigned int last_pos, depth;
 
 		if (!(valid_hooks & (1 << hook)))
 			continue;
 
-		depth = 0;
-		last_pos = pos;
 		/* Set initial back pointer. */
 		e->counters.pcnt = pos;
 
@@ -431,8 +428,6 @@ mark_source_chains(const struct xt_table_info *newinfo,
 					pos = e->counters.pcnt;
 					e->counters.pcnt = 0;
 
-					if (depth)
-						--depth;
 					/* We're at the start. */
 					if (pos == oldpos)
 						goto next;
@@ -457,9 +452,6 @@ mark_source_chains(const struct xt_table_info *newinfo,
 					if (!xt_find_jump_offset(offsets, newpos,
 								 newinfo->number))
 						return 0;
-
-					if (entry0 + newpos != ip6t_next_entry(e))
-						++depth;
 				} else {
 					/* ... this is a fallthru */
 					newpos = pos + e->next_offset;
@@ -470,15 +462,8 @@ mark_source_chains(const struct xt_table_info *newinfo,
 				e->counters.pcnt = pos;
 				pos = newpos;
 			}
-			if (depth == 0)
-				last_pos = pos;
-		}
-next:
-		if (last_pos != newinfo->underflow[hook]) {
-			pr_err_ratelimited("last base chain position %u doesn't match underflow %u (hook %u)\n",
-					   last_pos, newinfo->underflow[hook], hook);
-			return 0;
 		}
+next:		;
 	}
 	return 1;
 }
-- 
cgit v1.2.3


From 26c97c5d8dac6bc56d4360561a286f52543ac07e Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 20 Mar 2018 10:35:47 -0700
Subject: netfilter: ipset: Use is_zero_ether_addr instead of static and memcmp

To make the test a bit clearer and to reduce object size a little.

Miscellanea:

o remove now unnecessary static const array

$ size ip_set_hash_mac.o*
   text	   data	    bss	    dec	    hex	filename
  22822	   4619	     64	  27505	   6b71	ip_set_hash_mac.o.allyesconfig.new
  22932	   4683	     64	  27679	   6c1f	ip_set_hash_mac.o.allyesconfig.old
  10443	   1040	      0	  11483	   2cdb	ip_set_hash_mac.o.defconfig.new
  10507	   1040	      0	  11547	   2d1b	ip_set_hash_mac.o.defconfig.old

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipset/ip_set_hash_mac.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index 8f004edad396..f9d5a2a1e3d0 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -72,9 +72,6 @@ hash_mac4_data_next(struct hash_mac4_elem *next,
 #define IP_SET_PROTO_UNDEF
 #include "ip_set_hash_gen.h"
 
-/* Zero valued element is not supported */
-static const unsigned char invalid_ether[ETH_ALEN] = { 0 };
-
 static int
 hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb,
 	       const struct xt_action_param *par,
@@ -93,7 +90,7 @@ hash_mac4_kadt(struct ip_set *set, const struct sk_buff *skb,
 		return -EINVAL;
 
 	ether_addr_copy(e.ether, eth_hdr(skb)->h_source);
-	if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0)
+	if (is_zero_ether_addr(e.ether))
 		return -EINVAL;
 	return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
 }
@@ -118,7 +115,7 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[],
 	if (ret)
 		return ret;
 	ether_addr_copy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]));
-	if (memcmp(e.ether, invalid_ether, ETH_ALEN) == 0)
+	if (is_zero_ether_addr(e.ether))
 		return -IPSET_ERR_HASH_ELEM;
 
 	return adtfn(set, &e, &ext, &ext, flags);
-- 
cgit v1.2.3


From 9daae9bd47cff82a2a06aca23c458d6c79d09d52 Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Wed, 28 Mar 2018 17:46:54 +0300
Subject: net: Call add/kill vid ndo on vlan filter feature toggling

NETIF_F_HW_VLAN_[CS]TAG_FILTER features require more than just a bit
flip in dev->features in order to keep the driver in a consistent state.
These features notify the driver of each added/removed vlan, but toggling
of vlan-filter does not notify the driver accordingly for each of the
existing vlans.

This patch implements a similar solution to NETIF_F_RX_UDP_TUNNEL_PORT
behavior (which notifies the driver about UDP ports in the same manner
that vids are reported).

Each toggling of the features propagates to the 8021q module, which
iterates over the vlans and call add/kill ndo accordingly.

Signed-off-by: Gal Pressman <galp@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.c      |  21 +++++++++++
 net/8021q/vlan.h      |   3 ++
 net/8021q/vlan_core.c | 101 +++++++++++++++++++++++++++++++++++++-------------
 net/core/dev.c        |  20 ++++++++++
 4 files changed, 120 insertions(+), 25 deletions(-)

(limited to 'net')

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index bad01b14a4ad..5505ee6ebdbe 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -360,6 +360,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 	struct vlan_dev_priv *vlan;
 	bool last = false;
 	LIST_HEAD(list);
+	int err;
 
 	if (is_vlan_dev(dev)) {
 		int err = __vlan_device_event(dev, event);
@@ -489,6 +490,26 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 		vlan_group_for_each_dev(grp, i, vlandev)
 			call_netdevice_notifiers(event, vlandev);
 		break;
+
+	case NETDEV_CVLAN_FILTER_PUSH_INFO:
+		err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q));
+		if (err)
+			return notifier_from_errno(err);
+		break;
+
+	case NETDEV_CVLAN_FILTER_DROP_INFO:
+		vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q));
+		break;
+
+	case NETDEV_SVLAN_FILTER_PUSH_INFO:
+		err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD));
+		if (err)
+			return notifier_from_errno(err);
+		break;
+
+	case NETDEV_SVLAN_FILTER_DROP_INFO:
+		vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD));
+		break;
 	}
 
 out:
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index a8ba51030b75..e23aac3e4d37 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -97,6 +97,9 @@ static inline struct net_device *vlan_find_dev(struct net_device *real_dev,
 		if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \
 							    (i) % VLAN_N_VID)))
 
+int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto);
+void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto);
+
 /* found in vlan_dev.c */
 void vlan_dev_set_ingress_priority(const struct net_device *dev,
 				   u32 skb_prio, u16 vlan_prio);
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 45c9bf5ff3a0..c8d7abdc0463 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -165,13 +165,12 @@ struct vlan_vid_info {
 	int refcount;
 };
 
-static bool vlan_hw_filter_capable(const struct net_device *dev,
-				     const struct vlan_vid_info *vid_info)
+bool vlan_hw_filter_capable(const struct net_device *dev, __be16 proto)
 {
-	if (vid_info->proto == htons(ETH_P_8021Q) &&
+	if (proto == htons(ETH_P_8021Q) &&
 	    dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)
 		return true;
-	if (vid_info->proto == htons(ETH_P_8021AD) &&
+	if (proto == htons(ETH_P_8021AD) &&
 	    dev->features & NETIF_F_HW_VLAN_STAG_FILTER)
 		return true;
 	return false;
@@ -202,11 +201,73 @@ static struct vlan_vid_info *vlan_vid_info_alloc(__be16 proto, u16 vid)
 	return vid_info;
 }
 
+static int vlan_add_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid)
+{
+	if (!vlan_hw_filter_capable(dev, proto))
+		return 0;
+
+	if (netif_device_present(dev))
+		return dev->netdev_ops->ndo_vlan_rx_add_vid(dev, proto, vid);
+	else
+		return -ENODEV;
+}
+
+static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid)
+{
+	if (!vlan_hw_filter_capable(dev, proto))
+		return 0;
+
+	if (netif_device_present(dev))
+		return dev->netdev_ops->ndo_vlan_rx_kill_vid(dev, proto, vid);
+	else
+		return -ENODEV;
+}
+
+int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto)
+{
+	struct net_device *real_dev = vlan_info->real_dev;
+	struct vlan_vid_info *vlan_vid_info;
+	int err;
+
+	list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list) {
+		if (vlan_vid_info->proto == proto) {
+			err = vlan_add_rx_filter_info(real_dev, proto,
+						      vlan_vid_info->vid);
+			if (err)
+				goto unwind;
+		}
+	}
+
+	return 0;
+
+unwind:
+	list_for_each_entry_continue_reverse(vlan_vid_info,
+					     &vlan_info->vid_list, list) {
+		if (vlan_vid_info->proto == proto)
+			vlan_kill_rx_filter_info(real_dev, proto,
+						 vlan_vid_info->vid);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(vlan_filter_push_vids);
+
+void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto)
+{
+	struct vlan_vid_info *vlan_vid_info;
+
+	list_for_each_entry(vlan_vid_info, &vlan_info->vid_list, list)
+		if (vlan_vid_info->proto == proto)
+			vlan_kill_rx_filter_info(vlan_info->real_dev,
+						 vlan_vid_info->proto,
+						 vlan_vid_info->vid);
+}
+EXPORT_SYMBOL(vlan_filter_drop_vids);
+
 static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid,
 			  struct vlan_vid_info **pvid_info)
 {
 	struct net_device *dev = vlan_info->real_dev;
-	const struct net_device_ops *ops = dev->netdev_ops;
 	struct vlan_vid_info *vid_info;
 	int err;
 
@@ -214,16 +275,12 @@ static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid,
 	if (!vid_info)
 		return -ENOMEM;
 
-	if (vlan_hw_filter_capable(dev, vid_info)) {
-		if (netif_device_present(dev))
-			err = ops->ndo_vlan_rx_add_vid(dev, proto, vid);
-		else
-			err = -ENODEV;
-		if (err) {
-			kfree(vid_info);
-			return err;
-		}
+	err = vlan_add_rx_filter_info(dev, proto, vid);
+	if (err) {
+		kfree(vid_info);
+		return err;
 	}
+
 	list_add(&vid_info->list, &vlan_info->vid_list);
 	vlan_info->nr_vids++;
 	*pvid_info = vid_info;
@@ -270,21 +327,15 @@ static void __vlan_vid_del(struct vlan_info *vlan_info,
 			   struct vlan_vid_info *vid_info)
 {
 	struct net_device *dev = vlan_info->real_dev;
-	const struct net_device_ops *ops = dev->netdev_ops;
 	__be16 proto = vid_info->proto;
 	u16 vid = vid_info->vid;
 	int err;
 
-	if (vlan_hw_filter_capable(dev, vid_info)) {
-		if (netif_device_present(dev))
-			err = ops->ndo_vlan_rx_kill_vid(dev, proto, vid);
-		else
-			err = -ENODEV;
-		if (err) {
-			pr_warn("failed to kill vid %04x/%d for device %s\n",
-				proto, vid, dev->name);
-		}
-	}
+	err = vlan_kill_rx_filter_info(dev, proto, vid);
+	if (err)
+		pr_warn("failed to kill vid %04x/%d for device %s\n",
+			proto, vid, dev->name);
+
 	list_del(&vid_info->list);
 	kfree(vid_info);
 	vlan_info->nr_vids--;
diff --git a/net/core/dev.c b/net/core/dev.c
index eca5458b2753..1ddb6b9c58a8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1584,6 +1584,8 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
 	N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
 	N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
 	N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
+	N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
+	N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
 	};
 #undef N
 	return "UNKNOWN_NETDEV_EVENT";
@@ -7665,6 +7667,24 @@ sync_lower:
 			}
 		}
 
+		if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
+			if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
+				dev->features = features;
+				err |= vlan_get_rx_ctag_filter_info(dev);
+			} else {
+				vlan_drop_rx_ctag_filter_info(dev);
+			}
+		}
+
+		if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
+			if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
+				dev->features = features;
+				err |= vlan_get_rx_stag_filter_info(dev);
+			} else {
+				vlan_drop_rx_stag_filter_info(dev);
+			}
+		}
+
 		dev->features = features;
 	}
 
-- 
cgit v1.2.3


From e679c9c1dbfdba07b2a979a076cca74b773be8ce Mon Sep 17 00:00:00 2001
From: Russell King <rmk+kernel@armlinux.org.uk>
Date: Wed, 28 Mar 2018 15:44:16 -0700
Subject: sfp/phylink: move module EEPROM ethtool access into netdev core
 ethtool

Provide a pointer to the SFP bus in struct net_device, so that the
ethtool module EEPROM methods can access the SFP directly, rather
than needing every user to provide a hook for it.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index bb6e498c6e3d..eb55252ca1fb 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -22,6 +22,7 @@
 #include <linux/bitops.h>
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
+#include <linux/sfp.h>
 #include <linux/slab.h>
 #include <linux/rtnetlink.h>
 #include <linux/sched/signal.h>
@@ -2245,6 +2246,9 @@ static int __ethtool_get_module_info(struct net_device *dev,
 	const struct ethtool_ops *ops = dev->ethtool_ops;
 	struct phy_device *phydev = dev->phydev;
 
+	if (dev->sfp_bus)
+		return sfp_get_module_info(dev->sfp_bus, modinfo);
+
 	if (phydev && phydev->drv && phydev->drv->module_info)
 		return phydev->drv->module_info(phydev, modinfo);
 
@@ -2279,6 +2283,9 @@ static int __ethtool_get_module_eeprom(struct net_device *dev,
 	const struct ethtool_ops *ops = dev->ethtool_ops;
 	struct phy_device *phydev = dev->phydev;
 
+	if (dev->sfp_bus)
+		return sfp_get_module_eeprom(dev->sfp_bus, ee, data);
+
 	if (phydev && phydev->drv && phydev->drv->module_eeprom)
 		return phydev->drv->module_eeprom(phydev, ee, data);
 
-- 
cgit v1.2.3


From e9a441b6e729e16092fcc18e3962b952a01d1e3c Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 29 Mar 2018 17:03:25 +0300
Subject: xfrm: Register xfrm_dev_notifier in appropriate place

Currently, driver registers it from pernet_operations::init method,
and this breaks modularity, because initialization of net namespace
and netdevice notifiers are orthogonal actions. We don't have
per-namespace netdevice notifiers; all of them are global for all
devices in all namespaces.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_device.c | 2 +-
 net/xfrm/xfrm_policy.c | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index e87d6c4dd5b6..175941e15a6e 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -350,7 +350,7 @@ static struct notifier_block xfrm_dev_notifier = {
 	.notifier_call	= xfrm_dev_event,
 };
 
-void __net_init xfrm_dev_init(void)
+void __init xfrm_dev_init(void)
 {
 	register_netdevice_notifier(&xfrm_dev_notifier);
 }
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 0e065db6c7c0..40b54cc64243 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2895,8 +2895,6 @@ static int __net_init xfrm_policy_init(struct net *net)
 	INIT_LIST_HEAD(&net->xfrm.policy_all);
 	INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
 	INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
-	if (net_eq(net, &init_net))
-		xfrm_dev_init();
 	return 0;
 
 out_bydst:
@@ -2999,6 +2997,7 @@ void __init xfrm_init(void)
 		INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn);
 
 	register_pernet_subsys(&xfrm_net_ops);
+	xfrm_dev_init();
 	seqcount_init(&xfrm_policy_hash_generation);
 	xfrm_input_init();
 }
-- 
cgit v1.2.3


From 9e2f6c5d78db6647eb9d7bfeb20b9e0f9ff2c56c Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 29 Mar 2018 17:03:35 +0300
Subject: netfilter: Rework xt_TEE netdevice notifier

Register netdevice notifier for every iptable entry
is not good, since this breaks modularity, and
the hidden synchronization is based on rtnl_lock().

This patch reworks the synchronization via new lock,
while the rest of logic remains as it was before.
This is required for the next patch.

Tested via:

while :; do
	unshare -n iptables -t mangle -A OUTPUT -j TEE --gateway 1.1.1.2 --oif lo;
done

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/xt_TEE.c | 73 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 46 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 86b0580b2216..475957cfcf50 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -20,7 +20,7 @@
 #include <linux/netfilter/xt_TEE.h>
 
 struct xt_tee_priv {
-	struct notifier_block	notifier;
+	struct list_head	list;
 	struct xt_tee_tginfo	*tginfo;
 	int			oif;
 };
@@ -51,29 +51,35 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 }
 #endif
 
+static DEFINE_MUTEX(priv_list_mutex);
+static LIST_HEAD(priv_list);
+
 static int tee_netdev_event(struct notifier_block *this, unsigned long event,
 			    void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct xt_tee_priv *priv;
 
-	priv = container_of(this, struct xt_tee_priv, notifier);
-	switch (event) {
-	case NETDEV_REGISTER:
-		if (!strcmp(dev->name, priv->tginfo->oif))
-			priv->oif = dev->ifindex;
-		break;
-	case NETDEV_UNREGISTER:
-		if (dev->ifindex == priv->oif)
-			priv->oif = -1;
-		break;
-	case NETDEV_CHANGENAME:
-		if (!strcmp(dev->name, priv->tginfo->oif))
-			priv->oif = dev->ifindex;
-		else if (dev->ifindex == priv->oif)
-			priv->oif = -1;
-		break;
+	mutex_lock(&priv_list_mutex);
+	list_for_each_entry(priv, &priv_list, list) {
+		switch (event) {
+		case NETDEV_REGISTER:
+			if (!strcmp(dev->name, priv->tginfo->oif))
+				priv->oif = dev->ifindex;
+			break;
+		case NETDEV_UNREGISTER:
+			if (dev->ifindex == priv->oif)
+				priv->oif = -1;
+			break;
+		case NETDEV_CHANGENAME:
+			if (!strcmp(dev->name, priv->tginfo->oif))
+				priv->oif = dev->ifindex;
+			else if (dev->ifindex == priv->oif)
+				priv->oif = -1;
+			break;
+		}
 	}
+	mutex_unlock(&priv_list_mutex);
 
 	return NOTIFY_DONE;
 }
@@ -89,8 +95,6 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
 		return -EINVAL;
 
 	if (info->oif[0]) {
-		int ret;
-
 		if (info->oif[sizeof(info->oif)-1] != '\0')
 			return -EINVAL;
 
@@ -100,14 +104,11 @@ static int tee_tg_check(const struct xt_tgchk_param *par)
 
 		priv->tginfo  = info;
 		priv->oif     = -1;
-		priv->notifier.notifier_call = tee_netdev_event;
 		info->priv    = priv;
 
-		ret = register_netdevice_notifier(&priv->notifier);
-		if (ret) {
-			kfree(priv);
-			return ret;
-		}
+		mutex_lock(&priv_list_mutex);
+		list_add(&priv->list, &priv_list);
+		mutex_unlock(&priv_list_mutex);
 	} else
 		info->priv = NULL;
 
@@ -120,7 +121,9 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par)
 	struct xt_tee_tginfo *info = par->targinfo;
 
 	if (info->priv) {
-		unregister_netdevice_notifier(&info->priv->notifier);
+		mutex_lock(&priv_list_mutex);
+		list_del(&info->priv->list);
+		mutex_unlock(&priv_list_mutex);
 		kfree(info->priv);
 	}
 	static_key_slow_dec(&xt_tee_enabled);
@@ -153,13 +156,29 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
 #endif
 };
 
+static struct notifier_block tee_netdev_notifier = {
+	.notifier_call = tee_netdev_event,
+};
+
 static int __init tee_tg_init(void)
 {
-	return xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
+	int ret;
+
+	ret = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
+	if (ret)
+		return ret;
+	ret = register_netdevice_notifier(&tee_netdev_notifier);
+	if (ret) {
+		xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
+		return ret;
+	}
+
+	return 0;
 }
 
 static void __exit tee_tg_exit(void)
 {
+	unregister_netdevice_notifier(&tee_netdev_notifier);
 	xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
 }
 
-- 
cgit v1.2.3


From 328fbe747ad4622f0dfa98d2e19e836f03f80c04 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Thu, 29 Mar 2018 17:03:45 +0300
Subject: net: Close race between {un, }register_netdevice_notifier() and
 setup_net()/cleanup_net()

{un,}register_netdevice_notifier() iterate over all net namespaces
hashed to net_namespace_list. But pernet_operations register and
unregister netdevices in unhashed net namespace, and they are not
seen for netdevice notifiers. This results in asymmetry:

1)Race with register_netdevice_notifier()
  pernet_operations::init(net)	...
   register_netdevice()		...
    call_netdevice_notifiers()  ...
      ... nb is not called ...
  ...				register_netdevice_notifier(nb) -> net skipped
  ...				...
  list_add_tail(&net->list, ..) ...

  Then, userspace stops using net, and it's destructed:

  pernet_operations::exit(net)
   unregister_netdevice()
    call_netdevice_notifiers()
      ... nb is called ...

This always happens with net::loopback_dev, but it may be not the only device.

2)Race with unregister_netdevice_notifier()
  pernet_operations::init(net)
   register_netdevice()
    call_netdevice_notifiers()
      ... nb is called ...

  Then, userspace stops using net, and it's destructed:

  list_del_rcu(&net->list)	...
  pernet_operations::exit(net)  unregister_netdevice_notifier(nb) -> net skipped
   dev_change_net_namespace()	...
    call_netdevice_notifiers()
      ... nb is not called ...
   unregister_netdevice()
    call_netdevice_notifiers()
      ... nb is not called ...

This race is more danger, since dev_change_net_namespace() moves real
network devices, which use not trivial netdevice notifiers, and if this
will happen, the system will be left in unpredictable state.

The patch closes the race. During the testing I found two places,
where register_netdevice_notifier() is called from pernet init/exit
methods (which led to deadlock) and fixed them (see previous patches).

The review moved me to one more unusual registration place:
raw_init() (can driver). It may be a reason of problems,
if someone creates in-kernel CAN_RAW sockets, since they
will be destroyed in exit method and raw_release()
will call unregister_netdevice_notifier(). But grep over
kernel tree does not show, someone creates such sockets
from kernel space.

Theoretically, there can be more places like this, and which are
hidden from review, but we found them on the first bumping there
(since there is no a race, it will be 100% reproducible).

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 1ddb6b9c58a8..07da7add4845 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1625,6 +1625,8 @@ int register_netdevice_notifier(struct notifier_block *nb)
 	struct net *net;
 	int err;
 
+	/* Close race with setup_net() and cleanup_net() */
+	down_write(&pernet_ops_rwsem);
 	rtnl_lock();
 	err = raw_notifier_chain_register(&netdev_chain, nb);
 	if (err)
@@ -1649,6 +1651,7 @@ int register_netdevice_notifier(struct notifier_block *nb)
 
 unlock:
 	rtnl_unlock();
+	up_write(&pernet_ops_rwsem);
 	return err;
 
 rollback:
@@ -1694,6 +1697,8 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
 	struct net *net;
 	int err;
 
+	/* Close race with setup_net() and cleanup_net() */
+	down_write(&pernet_ops_rwsem);
 	rtnl_lock();
 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
 	if (err)
@@ -1713,6 +1718,7 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
 	up_read(&net_rwsem);
 unlock:
 	rtnl_unlock();
+	up_write(&pernet_ops_rwsem);
 	return err;
 }
 EXPORT_SYMBOL(unregister_netdevice_notifier);
-- 
cgit v1.2.3


From 428604fb118facce1309670779a35baf27ad044c Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Date: Thu, 29 Mar 2018 11:02:24 +0200
Subject: ipv6: do not set routes if disable_ipv6 has been enabled

Do not allow setting ipv6 routes from userspace if disable_ipv6 has been
enabled. The issue can be triggered using the following reproducer:

- sysctl net.ipv6.conf.all.disable_ipv6=1
- ip -6 route add a:b:c:d::/64 dev em1
- ip -6 route show
  a:b:c:d::/64 dev em1 metric 1024 pref medium

Fix it checking disable_ipv6 value in ip6_route_info_create routine

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net')

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ba8d5df50ebe..e461ef1158b6 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2917,6 +2917,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 	if (!dev)
 		goto out;
 
+	if (idev->cnf.disable_ipv6) {
+		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
+		err = -EACCES;
+		goto out;
+	}
+
 	if (!(dev->flags & IFF_UP)) {
 		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
 		err = -ENETDOWN;
-- 
cgit v1.2.3


From ace45bec6d77bc061c3c3d8ad99e298ea9800c2b Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 30 Mar 2018 21:04:43 +0100
Subject: rxrpc: Fix firewall route keepalive

Fix the firewall route keepalive part of AF_RXRPC which is currently
function incorrectly by replying to VERSION REPLY packets from the server
with VERSION REQUEST packets.

Instead, send VERSION REPLY packets to the peers of service connections to
act as keep-alives 20s after the latest packet was transmitted to that
peer.

Also, just discard VERSION REPLY packets rather than replying to them.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/af_rxrpc.c    |  4 +++
 net/rxrpc/ar-internal.h | 14 +++++++-
 net/rxrpc/conn_event.c  |  3 ++
 net/rxrpc/input.c       |  2 ++
 net/rxrpc/net_ns.c      | 21 ++++++++++-
 net/rxrpc/output.c      | 59 +++++++++++++++++++++++++++++-
 net/rxrpc/peer_event.c  | 96 +++++++++++++++++++++++++++++++++++++++++++++++++
 net/rxrpc/peer_object.c |  7 +++-
 net/rxrpc/rxkad.c       |  2 ++
 9 files changed, 204 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index ec5ec68be1aa..0b3026b8fa40 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -762,6 +762,7 @@ static __poll_t rxrpc_poll(struct file *file, struct socket *sock,
 static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
 			int kern)
 {
+	struct rxrpc_net *rxnet;
 	struct rxrpc_sock *rx;
 	struct sock *sk;
 
@@ -801,6 +802,9 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
 	rwlock_init(&rx->call_lock);
 	memset(&rx->srx, 0, sizeof(rx->srx));
 
+	rxnet = rxrpc_net(sock_net(&rx->sk));
+	timer_reduce(&rxnet->peer_keepalive_timer, jiffies + 1);
+
 	_leave(" = 0 [%p]", rx);
 	return 0;
 }
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 21cf164b6d85..8a348e0a9d95 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -97,8 +97,16 @@ struct rxrpc_net {
 	struct list_head	local_endpoints;
 	struct mutex		local_mutex;	/* Lock for ->local_endpoints */
 
-	spinlock_t		peer_hash_lock;	/* Lock for ->peer_hash */
 	DECLARE_HASHTABLE	(peer_hash, 10);
+	spinlock_t		peer_hash_lock;	/* Lock for ->peer_hash */
+
+#define RXRPC_KEEPALIVE_TIME 20 /* NAT keepalive time in seconds */
+	u8			peer_keepalive_cursor;
+	ktime_t			peer_keepalive_base;
+	struct hlist_head	peer_keepalive[RXRPC_KEEPALIVE_TIME + 1];
+	struct hlist_head	peer_keepalive_new;
+	struct timer_list	peer_keepalive_timer;
+	struct work_struct	peer_keepalive_work;
 };
 
 /*
@@ -285,6 +293,8 @@ struct rxrpc_peer {
 	struct hlist_head	error_targets;	/* targets for net error distribution */
 	struct work_struct	error_distributor;
 	struct rb_root		service_conns;	/* Service connections */
+	struct hlist_node	keepalive_link;	/* Link in net->peer_keepalive[] */
+	time64_t		last_tx_at;	/* Last time packet sent here */
 	seqlock_t		service_conn_lock;
 	spinlock_t		lock;		/* access lock */
 	unsigned int		if_mtu;		/* interface MTU for this peer */
@@ -1026,6 +1036,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *, bool, rxrpc_serial_t *);
 int rxrpc_send_abort_packet(struct rxrpc_call *);
 int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool);
 void rxrpc_reject_packets(struct rxrpc_local *);
+void rxrpc_send_keepalive(struct rxrpc_peer *);
 
 /*
  * peer_event.c
@@ -1034,6 +1045,7 @@ void rxrpc_error_report(struct sock *);
 void rxrpc_peer_error_distributor(struct work_struct *);
 void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace,
 			rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
+void rxrpc_peer_keepalive_worker(struct work_struct *);
 
 /*
  * peer_object.c
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index d2ec3fd593e8..c717152070df 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -136,6 +136,7 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
 	}
 
 	kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len);
+	conn->params.peer->last_tx_at = ktime_get_real();
 	_leave("");
 	return;
 }
@@ -239,6 +240,8 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
 		return -EAGAIN;
 	}
 
+	conn->params.peer->last_tx_at = ktime_get_real();
+
 	_leave(" = 0");
 	return 0;
 }
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 2a868fdab0ae..d4f2509e018b 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1183,6 +1183,8 @@ void rxrpc_data_ready(struct sock *udp_sk)
 
 	switch (sp->hdr.type) {
 	case RXRPC_PACKET_TYPE_VERSION:
+		if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED))
+			goto discard;
 		rxrpc_post_packet_to_local(local, skb);
 		goto out;
 
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index f18c9248e0d4..66baf2b80b6c 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -32,13 +32,22 @@ static void rxrpc_service_conn_reap_timeout(struct timer_list *timer)
 		rxrpc_queue_work(&rxnet->service_conn_reaper);
 }
 
+static void rxrpc_peer_keepalive_timeout(struct timer_list *timer)
+{
+	struct rxrpc_net *rxnet =
+		container_of(timer, struct rxrpc_net, peer_keepalive_timer);
+
+	if (rxnet->live)
+		rxrpc_queue_work(&rxnet->peer_keepalive_work);
+}
+
 /*
  * Initialise a per-network namespace record.
  */
 static __net_init int rxrpc_init_net(struct net *net)
 {
 	struct rxrpc_net *rxnet = rxrpc_net(net);
-	int ret;
+	int ret, i;
 
 	rxnet->live = true;
 	get_random_bytes(&rxnet->epoch, sizeof(rxnet->epoch));
@@ -70,8 +79,16 @@ static __net_init int rxrpc_init_net(struct net *net)
 
 	INIT_LIST_HEAD(&rxnet->local_endpoints);
 	mutex_init(&rxnet->local_mutex);
+
 	hash_init(rxnet->peer_hash);
 	spin_lock_init(&rxnet->peer_hash_lock);
+	for (i = 0; i < ARRAY_SIZE(rxnet->peer_keepalive); i++)
+		INIT_HLIST_HEAD(&rxnet->peer_keepalive[i]);
+	INIT_HLIST_HEAD(&rxnet->peer_keepalive_new);
+	timer_setup(&rxnet->peer_keepalive_timer,
+		    rxrpc_peer_keepalive_timeout, 0);
+	INIT_WORK(&rxnet->peer_keepalive_work, rxrpc_peer_keepalive_worker);
+	rxnet->peer_keepalive_base = ktime_add(ktime_get_real(), NSEC_PER_SEC);
 
 	ret = -ENOMEM;
 	rxnet->proc_net = proc_net_mkdir(net, "rxrpc", net->proc_net);
@@ -95,6 +112,8 @@ static __net_exit void rxrpc_exit_net(struct net *net)
 	struct rxrpc_net *rxnet = rxrpc_net(net);
 
 	rxnet->live = false;
+	del_timer_sync(&rxnet->peer_keepalive_timer);
+	cancel_work_sync(&rxnet->peer_keepalive_work);
 	rxrpc_destroy_all_calls(rxnet);
 	rxrpc_destroy_all_connections(rxnet);
 	rxrpc_destroy_all_locals(rxnet);
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index cf73dc006c3b..7f1fc04775b3 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -32,6 +32,8 @@ struct rxrpc_abort_buffer {
 	__be32 abort_code;
 };
 
+static const char rxrpc_keepalive_string[] = "";
+
 /*
  * Arrange for a keepalive ping a certain time after we last transmitted.  This
  * lets the far side know we're still interested in this call and helps keep
@@ -122,6 +124,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
 	struct kvec iov[2];
 	rxrpc_serial_t serial;
 	rxrpc_seq_t hard_ack, top;
+	ktime_t now;
 	size_t len, n;
 	int ret;
 	u8 reason;
@@ -203,8 +206,10 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
 	}
 
 	ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
+	now = ktime_get_real();
 	if (ping)
-		call->ping_time = ktime_get_real();
+		call->ping_time = now;
+	conn->params.peer->last_tx_at = ktime_get_real();
 
 	if (call->state < RXRPC_CALL_COMPLETE) {
 		if (ret < 0) {
@@ -288,6 +293,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
 
 	ret = kernel_sendmsg(conn->params.local->socket,
 			     &msg, iov, 1, sizeof(pkt));
+	conn->params.peer->last_tx_at = ktime_get_real();
 
 	rxrpc_put_connection(conn);
 	return ret;
@@ -378,6 +384,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
 	 *     message and update the peer record
 	 */
 	ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
+	conn->params.peer->last_tx_at = ktime_get_real();
 
 	up_read(&conn->params.local->defrag_sem);
 	if (ret == -EMSGSIZE)
@@ -429,6 +436,7 @@ send_fragmentable:
 		if (ret == 0) {
 			ret = kernel_sendmsg(conn->params.local->socket, &msg,
 					     iov, 2, len);
+			conn->params.peer->last_tx_at = ktime_get_real();
 
 			opt = IP_PMTUDISC_DO;
 			kernel_setsockopt(conn->params.local->socket, SOL_IP,
@@ -446,6 +454,7 @@ send_fragmentable:
 		if (ret == 0) {
 			ret = kernel_sendmsg(conn->params.local->socket, &msg,
 					     iov, 2, len);
+			conn->params.peer->last_tx_at = ktime_get_real();
 
 			opt = IPV6_PMTUDISC_DO;
 			kernel_setsockopt(conn->params.local->socket,
@@ -515,3 +524,51 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
 
 	_leave("");
 }
+
+/*
+ * Send a VERSION reply to a peer as a keepalive.
+ */
+void rxrpc_send_keepalive(struct rxrpc_peer *peer)
+{
+	struct rxrpc_wire_header whdr;
+	struct msghdr msg;
+	struct kvec iov[2];
+	size_t len;
+	int ret;
+
+	_enter("");
+
+	msg.msg_name	= &peer->srx.transport;
+	msg.msg_namelen	= peer->srx.transport_len;
+	msg.msg_control	= NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags	= 0;
+
+	whdr.epoch	= htonl(peer->local->rxnet->epoch);
+	whdr.cid	= 0;
+	whdr.callNumber	= 0;
+	whdr.seq	= 0;
+	whdr.serial	= 0;
+	whdr.type	= RXRPC_PACKET_TYPE_VERSION; /* Not client-initiated */
+	whdr.flags	= RXRPC_LAST_PACKET;
+	whdr.userStatus	= 0;
+	whdr.securityIndex = 0;
+	whdr._rsvd	= 0;
+	whdr.serviceId	= 0;
+
+	iov[0].iov_base	= &whdr;
+	iov[0].iov_len	= sizeof(whdr);
+	iov[1].iov_base	= (char *)rxrpc_keepalive_string;
+	iov[1].iov_len	= sizeof(rxrpc_keepalive_string);
+
+	len = iov[0].iov_len + iov[1].iov_len;
+
+	_proto("Tx VERSION (keepalive)");
+
+	ret = kernel_sendmsg(peer->local->socket, &msg, iov, 2, len);
+	if (ret < 0)
+		_debug("sendmsg failed: %d", ret);
+
+	peer->last_tx_at = ktime_get_real();
+	_leave("");
+}
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 7f749505e699..d01eb9a06448 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -348,3 +348,99 @@ void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
 	trace_rxrpc_rtt_rx(call, why, send_serial, resp_serial, rtt,
 			   usage, avg);
 }
+
+/*
+ * Perform keep-alive pings with VERSION packets to keep any NAT alive.
+ */
+void rxrpc_peer_keepalive_worker(struct work_struct *work)
+{
+	struct rxrpc_net *rxnet =
+		container_of(work, struct rxrpc_net, peer_keepalive_work);
+	struct rxrpc_peer *peer;
+	unsigned long delay;
+	ktime_t base, now = ktime_get_real();
+	s64 diff;
+	u8 cursor, slot;
+
+	base = rxnet->peer_keepalive_base;
+	cursor = rxnet->peer_keepalive_cursor;
+
+	_enter("%u,%lld", cursor, ktime_sub(now, base));
+
+next_bucket:
+	diff = ktime_to_ns(ktime_sub(now, base));
+	if (diff < 0)
+		goto resched;
+
+	_debug("at %u", cursor);
+	spin_lock_bh(&rxnet->peer_hash_lock);
+next_peer:
+	if (!rxnet->live) {
+		spin_unlock_bh(&rxnet->peer_hash_lock);
+		goto out;
+	}
+
+	/* Everything in the bucket at the cursor is processed this second; the
+	 * bucket at cursor + 1 goes now + 1s and so on...
+	 */
+	if (hlist_empty(&rxnet->peer_keepalive[cursor])) {
+		if (hlist_empty(&rxnet->peer_keepalive_new)) {
+			spin_unlock_bh(&rxnet->peer_hash_lock);
+			goto emptied_bucket;
+		}
+
+		hlist_move_list(&rxnet->peer_keepalive_new,
+				&rxnet->peer_keepalive[cursor]);
+	}
+
+	peer = hlist_entry(rxnet->peer_keepalive[cursor].first,
+			   struct rxrpc_peer, keepalive_link);
+	hlist_del_init(&peer->keepalive_link);
+	if (!rxrpc_get_peer_maybe(peer))
+		goto next_peer;
+
+	spin_unlock_bh(&rxnet->peer_hash_lock);
+
+	_debug("peer %u {%pISp}", peer->debug_id, &peer->srx.transport);
+
+recalc:
+	diff = ktime_divns(ktime_sub(peer->last_tx_at, base), NSEC_PER_SEC);
+	if (diff < -30 || diff > 30)
+		goto send; /* LSW of 64-bit time probably wrapped on 32-bit */
+	diff += RXRPC_KEEPALIVE_TIME - 1;
+	if (diff < 0)
+		goto send;
+
+	slot = (diff > RXRPC_KEEPALIVE_TIME - 1) ? RXRPC_KEEPALIVE_TIME - 1 : diff;
+	if (slot == 0)
+		goto send;
+
+	/* A transmission to this peer occurred since last we examined it so
+	 * put it into the appropriate future bucket.
+	 */
+	slot = (slot + cursor) % ARRAY_SIZE(rxnet->peer_keepalive);
+	spin_lock_bh(&rxnet->peer_hash_lock);
+	hlist_add_head(&peer->keepalive_link, &rxnet->peer_keepalive[slot]);
+	rxrpc_put_peer(peer);
+	goto next_peer;
+
+send:
+	rxrpc_send_keepalive(peer);
+	now = ktime_get_real();
+	goto recalc;
+
+emptied_bucket:
+	cursor++;
+	if (cursor >= ARRAY_SIZE(rxnet->peer_keepalive))
+		cursor = 0;
+	base = ktime_add_ns(base, NSEC_PER_SEC);
+	goto next_bucket;
+
+resched:
+	rxnet->peer_keepalive_base = base;
+	rxnet->peer_keepalive_cursor = cursor;
+	delay = nsecs_to_jiffies(-diff) + 1;
+	timer_reduce(&rxnet->peer_keepalive_timer, jiffies + delay);
+out:
+	_leave("");
+}
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index d02a99f37f5f..94a6dbfcf129 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -322,6 +322,7 @@ struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *local,
 	if (!peer) {
 		peer = prealloc;
 		hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key);
+		hlist_add_head(&peer->keepalive_link, &rxnet->peer_keepalive_new);
 	}
 
 	spin_unlock(&rxnet->peer_hash_lock);
@@ -363,9 +364,12 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
 		peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key);
 		if (peer && !rxrpc_get_peer_maybe(peer))
 			peer = NULL;
-		if (!peer)
+		if (!peer) {
 			hash_add_rcu(rxnet->peer_hash,
 				     &candidate->hash_link, hash_key);
+			hlist_add_head(&candidate->keepalive_link,
+				       &rxnet->peer_keepalive_new);
+		}
 
 		spin_unlock_bh(&rxnet->peer_hash_lock);
 
@@ -392,6 +396,7 @@ void __rxrpc_put_peer(struct rxrpc_peer *peer)
 
 	spin_lock_bh(&rxnet->peer_hash_lock);
 	hash_del_rcu(&peer->hash_link);
+	hlist_del_init(&peer->keepalive_link);
 	spin_unlock_bh(&rxnet->peer_hash_lock);
 
 	kfree_rcu(peer, rcu);
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 77cb23c7bd0a..588fea0dd362 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -668,6 +668,7 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
 		return -EAGAIN;
 	}
 
+	conn->params.peer->last_tx_at = ktime_get_real();
 	_leave(" = 0");
 	return 0;
 }
@@ -722,6 +723,7 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
 		return -EAGAIN;
 	}
 
+	conn->params.peer->last_tx_at = ktime_get_real();
 	_leave(" = 0");
 	return 0;
 }
-- 
cgit v1.2.3


From f82eb88b0fa6943f58760fd7c3d1b12c1f9cf492 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 30 Mar 2018 21:04:43 +0100
Subject: rxrpc: Fix a bit of time confusion

The rxrpc_reduce_call_timer() function should be passed the 'current time'
in jiffies, not the current ktime time.  It's confusing in rxrpc_resend
because that has to deal with both.  Pass the correct current time in.

Note that this only affects the trace produced and not the functioning of
the code.

Fixes: a158bdd3247b ("rxrpc: Fix call timeouts")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/call_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 6a62e42e1d8d..3dee89c7a06e 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -238,7 +238,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
 	 * retransmitting data.
 	 */
 	if (!retrans) {
-		rxrpc_reduce_call_timer(call, resend_at, now,
+		rxrpc_reduce_call_timer(call, resend_at, now_j,
 					rxrpc_timer_set_for_resend);
 		spin_unlock_bh(&call->lock);
 		ack_ts = ktime_sub(now, call->acks_latest_ts);
-- 
cgit v1.2.3


From 03877bf6a30cca7d4bc3ffabd3c3e9464a7a1a19 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 30 Mar 2018 21:04:43 +0100
Subject: rxrpc: Fix Tx ring annotation after initial Tx failure

rxrpc calls have a ring of packets that are awaiting ACK or retransmission
and a parallel ring of annotations that tracks the state of those packets.
If the initial transmission of a packet on the underlying UDP socket fails
then the packet annotation is marked for resend - but the setting of this
mark accidentally erases the last-packet mark also stored in the same
annotation slot.  If this happens, a call won't switch out of the Tx phase
when all the packets have been transmitted.

Fix this by retaining the last-packet mark and only altering the packet
state.

Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/sendmsg.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 8503f279b467..783c777fc6e7 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -130,7 +130,9 @@ static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix)
 	spin_lock_bh(&call->lock);
 
 	if (call->state < RXRPC_CALL_COMPLETE) {
-		call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS;
+		call->rxtx_annotations[ix] =
+			(call->rxtx_annotations[ix] & RXRPC_TX_ANNO_LAST) |
+			RXRPC_TX_ANNO_RETRANS;
 		if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
 			rxrpc_queue_call(call);
 	}
-- 
cgit v1.2.3


From 57b0c9d49b94bbeb53649b7fbd264603c1ebd585 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 30 Mar 2018 21:04:44 +0100
Subject: rxrpc: Don't treat call aborts as conn aborts

If a call-level abort is received for the previous call to complete on a
connection channel, then that abort is queued for the connection processor
to handle.  Unfortunately, the connection processor then assumes without
checking that the abort is connection-level (ie. callNumber is 0) and
distributes it over all active calls on that connection, thereby
incorrectly aborting them.

Fix this by discarding aborts aimed at a completed call.

Further, discard all packets aimed at a call that's complete if there's
currently an active call on a channel, since the DATA packets associated
with the new call automatically terminate the old call.

Fixes: 18bfeba50dfd ("rxrpc: Perform terminal call ACK/ABORT retransmission from conn processor")
Reported-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/input.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index d4f2509e018b..21800e6f5019 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1242,16 +1242,19 @@ void rxrpc_data_ready(struct sock *udp_sk)
 			goto discard_unlock;
 
 		if (sp->hdr.callNumber == chan->last_call) {
-			/* For the previous service call, if completed successfully, we
-			 * discard all further packets.
+			if (chan->call ||
+			    sp->hdr.type == RXRPC_PACKET_TYPE_ABORT)
+				goto discard_unlock;
+
+			/* For the previous service call, if completed
+			 * successfully, we discard all further packets.
 			 */
 			if (rxrpc_conn_is_service(conn) &&
-			    (chan->last_type == RXRPC_PACKET_TYPE_ACK ||
-			     sp->hdr.type == RXRPC_PACKET_TYPE_ABORT))
+			    chan->last_type == RXRPC_PACKET_TYPE_ACK)
 				goto discard_unlock;
 
-			/* But otherwise we need to retransmit the final packet from
-			 * data cached in the connection record.
+			/* But otherwise we need to retransmit the final packet
+			 * from data cached in the connection record.
 			 */
 			rxrpc_post_packet_to_conn(conn, skb);
 			goto out_unlock;
-- 
cgit v1.2.3


From 59299aa1028fce051adbd25aaff7c387b865cd6d Mon Sep 17 00:00:00 2001
From: Marc Dionne <marc.dionne@auristor.com>
Date: Fri, 30 Mar 2018 21:04:44 +0100
Subject: rxrpc: Fix resend event time calculation

Commit a158bdd3 ("rxrpc: Fix call timeouts") reworked the time calculation
for the next resend event.  For this calculation, "oldest" will be before
"now", so ktime_sub(oldest, now) will yield a negative value.  When passed
to nsecs_to_jiffies which expects an unsigned value, the end result will be
a very large value, and a resend event scheduled far into the future.  This
could cause calls to stall if some packets were lost.

Fix by ordering the arguments to ktime_sub correctly.

Fixes: a158bdd3247b ("rxrpc: Fix call timeouts")
Signed-off-by: Marc Dionne <marc.dionne@auristor.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/call_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 3dee89c7a06e..6e0d788b4dc4 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -226,7 +226,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
 				       ktime_to_ns(ktime_sub(skb->tstamp, max_age)));
 	}
 
-	resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(oldest, now)));
+	resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest)));
 	resend_at += jiffies + rxrpc_resend_timeout;
 	WRITE_ONCE(call->resend_at, resend_at);
 
-- 
cgit v1.2.3


From edb63e2b271752a9424a3d33cfcd4f434a020f9b Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 30 Mar 2018 21:04:44 +0100
Subject: rxrpc: remove unused static variables

The rxrpc_security_methods and rxrpc_security_sem user has been removed
in 648af7fca159 ("rxrpc: Absorb the rxkad security module"). This was
noticed by kbuild test robot for the -RT tree but is also true for !RT.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/security.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index e9f428351293..c4479afe8ae7 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -19,9 +19,6 @@
 #include <keys/rxrpc-type.h>
 #include "ar-internal.h"
 
-static LIST_HEAD(rxrpc_security_methods);
-static DECLARE_RWSEM(rxrpc_security_sem);
-
 static const struct rxrpc_security *rxrpc_security_types[] = {
 	[RXRPC_SECURITY_NONE]	= &rxrpc_no_security,
 #ifdef CONFIG_RXKAD
-- 
cgit v1.2.3


From 88f2a8257c9aa7df957b1a79a104f348d60d8027 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 30 Mar 2018 21:05:17 +0100
Subject: rxrpc: Fix checker warnings and errors

Fix various issues detected by checker.

Errors:

 (*) rxrpc_discard_prealloc() should be using rcu_assign_pointer to set
     call->socket.

Warnings:

 (*) rxrpc_service_connection_reaper() should be passing NULL rather than 0 to
     trace_rxrpc_conn() as the where argument.

 (*) rxrpc_disconnect_client_call() should get its net pointer via the
     call->conn rather than call->sock to avoid a warning about accessing
     an RCU pointer without protection.

 (*) Proc seq start/stop functions need annotation as they pass locks
     between the functions.

False positives:

 (*) Checker doesn't correctly handle of seq-retry lock context balance in
     rxrpc_find_service_conn_rcu().

 (*) Checker thinks execution may proceed past the BUG() in
     rxrpc_publish_service_conn().

 (*) Variable length array warnings from SKCIPHER_REQUEST_ON_STACK() in
     rxkad.c.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/call_accept.c | 3 ++-
 net/rxrpc/call_object.c | 1 +
 net/rxrpc/conn_client.c | 2 +-
 net/rxrpc/conn_object.c | 2 +-
 net/rxrpc/proc.c        | 6 ++++++
 net/rxrpc/sendmsg.c     | 2 ++
 6 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 92ebd1d7e0bb..4ce24c000653 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -225,7 +225,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
 	tail = b->call_backlog_tail;
 	while (CIRC_CNT(head, tail, size) > 0) {
 		struct rxrpc_call *call = b->call_backlog[tail];
-		call->socket = rx;
+		rcu_assign_pointer(call->socket, rx);
 		if (rx->discard_new_call) {
 			_debug("discard %lx", call->user_call_ID);
 			rx->discard_new_call(call, call->user_call_ID);
@@ -456,6 +456,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
 				     unsigned long user_call_ID,
 				     rxrpc_notify_rx_t notify_rx)
 	__releases(&rx->sk.sk_lock.slock)
+	__acquires(call->user_mutex)
 {
 	struct rxrpc_call *call;
 	struct rb_node *parent, **pp;
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 147657dfe757..85b12c472522 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -219,6 +219,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
 					 gfp_t gfp,
 					 unsigned int debug_id)
 	__releases(&rx->sk.sk_lock.slock)
+	__acquires(&call->user_mutex)
 {
 	struct rxrpc_call *call, *xcall;
 	struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk));
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 064175068059..041da40dbf93 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -776,7 +776,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
 	unsigned int channel = call->cid & RXRPC_CHANNELMASK;
 	struct rxrpc_connection *conn = call->conn;
 	struct rxrpc_channel *chan = &conn->channels[channel];
-	struct rxrpc_net *rxnet = rxrpc_net(sock_net(&call->socket->sk));
+	struct rxrpc_net *rxnet = conn->params.local->rxnet;
 
 	trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
 	call->conn = NULL;
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index ccbac190add1..bfc46fd69a62 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -418,7 +418,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work)
 		 */
 		if (atomic_cmpxchg(&conn->usage, 1, 0) != 1)
 			continue;
-		trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, 0);
+		trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, NULL);
 
 		if (rxrpc_conn_is_client(conn))
 			BUG();
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index f79f260c6ddc..7e45db058823 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -29,6 +29,8 @@ static const char *const rxrpc_conn_states[RXRPC_CONN__NR_STATES] = {
  * generate a list of extant and dead calls in /proc/net/rxrpc_calls
  */
 static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos)
+	__acquires(rcu)
+	__acquires(rxnet->call_lock)
 {
 	struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
 
@@ -45,6 +47,8 @@ static void *rxrpc_call_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void rxrpc_call_seq_stop(struct seq_file *seq, void *v)
+	__releases(rxnet->call_lock)
+	__releases(rcu)
 {
 	struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
 
@@ -135,6 +139,7 @@ const struct file_operations rxrpc_call_seq_fops = {
  * generate a list of extant virtual connections in /proc/net/rxrpc_conns
  */
 static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t *_pos)
+	__acquires(rxnet->conn_lock)
 {
 	struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
 
@@ -151,6 +156,7 @@ static void *rxrpc_connection_seq_next(struct seq_file *seq, void *v,
 }
 
 static void rxrpc_connection_seq_stop(struct seq_file *seq, void *v)
+	__releases(rxnet->conn_lock)
 {
 	struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
 
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 783c777fc6e7..a62980a80151 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -556,6 +556,7 @@ static struct rxrpc_call *
 rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
 				  struct rxrpc_send_params *p)
 	__releases(&rx->sk.sk_lock.slock)
+	__acquires(&call->user_mutex)
 {
 	struct rxrpc_conn_parameters cp;
 	struct rxrpc_call *call;
@@ -596,6 +597,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
  */
 int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
 	__releases(&rx->sk.sk_lock.slock)
+	__releases(&call->user_mutex)
 {
 	enum rxrpc_call_state state;
 	struct rxrpc_call *call;
-- 
cgit v1.2.3


From d3be4d244330f7ef53242d8dc1b7f77d105e767f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 30 Mar 2018 21:05:23 +0100
Subject: rxrpc: Fix potential call vs socket/net destruction race

rxrpc_call structs don't pin sockets or network namespaces, but may attempt
to access both after their refcount reaches 0 so that they can detach
themselves from the network namespace.  However, there's no guarantee that
the socket still exists at this point (so sock_net(&call->socket->sk) may
be invalid) and the namespace may have gone away if the call isn't pinning
a peer.

Fix this by (a) carrying a net pointer in the rxrpc_call struct and (b)
waiting for all calls to be destroyed when the network namespace goes away.

This was detected by checker:

net/rxrpc/call_object.c:634:57: warning: incorrect type in argument 1 (different address spaces)
net/rxrpc/call_object.c:634:57:    expected struct sock const *sk
net/rxrpc/call_object.c:634:57:    got struct sock [noderef] <asn:4>*<noident>

Fixes: 2baec2c3f854 ("rxrpc: Support network namespacing")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-internal.h |  2 ++
 net/rxrpc/call_accept.c |  1 +
 net/rxrpc/call_object.c | 16 +++++++++++++---
 net/rxrpc/net_ns.c      |  1 +
 4 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 8a348e0a9d95..2a2b0fdfb157 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -75,6 +75,7 @@ struct rxrpc_net {
 	u32			epoch;		/* Local epoch for detecting local-end reset */
 	struct list_head	calls;		/* List of calls active in this namespace */
 	rwlock_t		call_lock;	/* Lock for ->calls */
+	atomic_t		nr_calls;	/* Count of allocated calls */
 
 	struct list_head	conn_proc_list;	/* List of conns in this namespace for proc */
 	struct list_head	service_conns;	/* Service conns in this namespace */
@@ -528,6 +529,7 @@ struct rxrpc_call {
 	struct rxrpc_connection	*conn;		/* connection carrying call */
 	struct rxrpc_peer	*peer;		/* Peer record for remote address */
 	struct rxrpc_sock __rcu	*socket;	/* socket responsible */
+	struct rxrpc_net	*rxnet;		/* Network namespace to which call belongs */
 	struct mutex		user_mutex;	/* User access mutex */
 	unsigned long		ack_at;		/* When deferred ACK needs to happen */
 	unsigned long		ack_lost_at;	/* When ACK is figured as lost */
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 4ce24c000653..493545033e42 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -138,6 +138,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
 
 	write_unlock(&rx->call_lock);
 
+	rxnet = call->rxnet;
 	write_lock(&rxnet->call_lock);
 	list_add_tail(&call->link, &rxnet->calls);
 	write_unlock(&rxnet->call_lock);
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 85b12c472522..f721c2b7e234 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -103,6 +103,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
 				    unsigned int debug_id)
 {
 	struct rxrpc_call *call;
+	struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk));
 
 	call = kmem_cache_zalloc(rxrpc_call_jar, gfp);
 	if (!call)
@@ -153,6 +154,9 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
 
 	call->cong_cwnd = 2;
 	call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1;
+
+	call->rxnet = rxnet;
+	atomic_inc(&rxnet->nr_calls);
 	return call;
 
 nomem_2:
@@ -222,7 +226,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
 	__acquires(&call->user_mutex)
 {
 	struct rxrpc_call *call, *xcall;
-	struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk));
+	struct rxrpc_net *rxnet;
 	struct rb_node *parent, **pp;
 	const void *here = __builtin_return_address(0);
 	int ret;
@@ -272,6 +276,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
 
 	write_unlock(&rx->call_lock);
 
+	rxnet = call->rxnet;
 	write_lock(&rxnet->call_lock);
 	list_add_tail(&call->link, &rxnet->calls);
 	write_unlock(&rxnet->call_lock);
@@ -617,7 +622,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
  */
 void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
 {
-	struct rxrpc_net *rxnet;
+	struct rxrpc_net *rxnet = call->rxnet;
 	const void *here = __builtin_return_address(0);
 	int n;
 
@@ -631,7 +636,6 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
 		ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
 
 		if (!list_empty(&call->link)) {
-			rxnet = rxrpc_net(sock_net(&call->socket->sk));
 			write_lock(&rxnet->call_lock);
 			list_del_init(&call->link);
 			write_unlock(&rxnet->call_lock);
@@ -647,11 +651,14 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
 static void rxrpc_rcu_destroy_call(struct rcu_head *rcu)
 {
 	struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu);
+	struct rxrpc_net *rxnet = call->rxnet;
 
 	rxrpc_put_peer(call->peer);
 	kfree(call->rxtx_buffer);
 	kfree(call->rxtx_annotations);
 	kmem_cache_free(rxrpc_call_jar, call);
+	if (atomic_dec_and_test(&rxnet->nr_calls))
+		wake_up_atomic_t(&rxnet->nr_calls);
 }
 
 /*
@@ -716,4 +723,7 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet)
 	}
 
 	write_unlock(&rxnet->call_lock);
+
+	atomic_dec(&rxnet->nr_calls);
+	wait_on_atomic_t(&rxnet->nr_calls, atomic_t_wait, TASK_UNINTERRUPTIBLE);
 }
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index 66baf2b80b6c..101019b0be34 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -55,6 +55,7 @@ static __net_init int rxrpc_init_net(struct net *net)
 
 	INIT_LIST_HEAD(&rxnet->calls);
 	rwlock_init(&rxnet->call_lock);
+	atomic_set(&rxnet->nr_calls, 1);
 
 	INIT_LIST_HEAD(&rxnet->conn_proc_list);
 	INIT_LIST_HEAD(&rxnet->service_conns);
-- 
cgit v1.2.3


From 09d2bf595db4b4075ea721acd61e180d6bb18f88 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 30 Mar 2018 21:05:28 +0100
Subject: rxrpc: Add a tracepoint to track rxrpc_local refcounting

Add a tracepoint to track reference counting on the rxrpc_local struct.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-internal.h  | 27 +++-----------------
 net/rxrpc/call_accept.c  |  3 +--
 net/rxrpc/local_object.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 68 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 2a2b0fdfb157..cc51d3eb0548 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -981,31 +981,12 @@ extern void rxrpc_process_local_events(struct rxrpc_local *);
  * local_object.c
  */
 struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct sockaddr_rxrpc *);
-void __rxrpc_put_local(struct rxrpc_local *);
+struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *);
+struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *);
+void rxrpc_put_local(struct rxrpc_local *);
+void rxrpc_queue_local(struct rxrpc_local *);
 void rxrpc_destroy_all_locals(struct rxrpc_net *);
 
-static inline void rxrpc_get_local(struct rxrpc_local *local)
-{
-	atomic_inc(&local->usage);
-}
-
-static inline
-struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local)
-{
-	return atomic_inc_not_zero(&local->usage) ? local : NULL;
-}
-
-static inline void rxrpc_put_local(struct rxrpc_local *local)
-{
-	if (local && atomic_dec_and_test(&local->usage))
-		__rxrpc_put_local(local);
-}
-
-static inline void rxrpc_queue_local(struct rxrpc_local *local)
-{
-	rxrpc_queue_work(&local->processor);
-}
-
 /*
  * misc.c
  */
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 493545033e42..5a9b1d916124 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -296,8 +296,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx,
 		b->conn_backlog[conn_tail] = NULL;
 		smp_store_release(&b->conn_backlog_tail,
 				  (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1));
-		rxrpc_get_local(local);
-		conn->params.local = local;
+		conn->params.local = rxrpc_get_local(local);
 		conn->params.peer = peer;
 		rxrpc_see_connection(conn);
 		rxrpc_new_incoming_connection(rx, conn, skb);
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 38b99db30e54..8b54e9531d52 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -95,6 +95,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet,
 		local->debug_id = atomic_inc_return(&rxrpc_debug_id);
 		memcpy(&local->srx, srx, sizeof(*srx));
 		local->srx.srx_service = 0;
+		trace_rxrpc_local(local, rxrpc_local_new, 1, NULL);
 	}
 
 	_leave(" = %p", local);
@@ -256,15 +257,74 @@ addr_in_use:
 	return ERR_PTR(-EADDRINUSE);
 }
 
+/*
+ * Get a ref on a local endpoint.
+ */
+struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *local)
+{
+	const void *here = __builtin_return_address(0);
+	int n;
+
+	n = atomic_inc_return(&local->usage);
+	trace_rxrpc_local(local, rxrpc_local_got, n, here);
+	return local;
+}
+
+/*
+ * Get a ref on a local endpoint unless its usage has already reached 0.
+ */
+struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local)
+{
+	const void *here = __builtin_return_address(0);
+
+	if (local) {
+		int n = __atomic_add_unless(&local->usage, 1, 0);
+		if (n > 0)
+			trace_rxrpc_local(local, rxrpc_local_got, n + 1, here);
+		else
+			local = NULL;
+	}
+	return local;
+}
+
+/*
+ * Queue a local endpoint.
+ */
+void rxrpc_queue_local(struct rxrpc_local *local)
+{
+	const void *here = __builtin_return_address(0);
+
+	if (rxrpc_queue_work(&local->processor))
+		trace_rxrpc_local(local, rxrpc_local_queued,
+				  atomic_read(&local->usage), here);
+}
+
 /*
  * A local endpoint reached its end of life.
  */
-void __rxrpc_put_local(struct rxrpc_local *local)
+static void __rxrpc_put_local(struct rxrpc_local *local)
 {
 	_enter("%d", local->debug_id);
 	rxrpc_queue_work(&local->processor);
 }
 
+/*
+ * Drop a ref on a local endpoint.
+ */
+void rxrpc_put_local(struct rxrpc_local *local)
+{
+	const void *here = __builtin_return_address(0);
+	int n;
+
+	if (local) {
+		n = atomic_dec_return(&local->usage);
+		trace_rxrpc_local(local, rxrpc_local_put, n, here);
+
+		if (n == 0)
+			__rxrpc_put_local(local);
+	}
+}
+
 /*
  * Destroy a local endpoint's socket and then hand the record to RCU to dispose
  * of.
@@ -322,7 +382,8 @@ static void rxrpc_local_processor(struct work_struct *work)
 		container_of(work, struct rxrpc_local, processor);
 	bool again;
 
-	_enter("%d", local->debug_id);
+	trace_rxrpc_local(local, rxrpc_local_processing,
+			  atomic_read(&local->usage), NULL);
 
 	do {
 		again = false;
-- 
cgit v1.2.3


From 31f5f9a1691ebef2113c8bdb3edcb8859f30f702 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 30 Mar 2018 21:05:33 +0100
Subject: rxrpc: Fix apparent leak of rxrpc_local objects

rxrpc_local objects cannot be disposed of until all the connections that
point to them have been RCU'd as a connection object holds refcount on the
local endpoint it is communicating through.  Currently, this can cause an
assertion failure to occur when a network namespace is destroyed as there's
no check that the RCU destructors for the connections have been run before
we start trying to destroy local endpoints.

The kernel reports:

	rxrpc: AF_RXRPC: Leaked local 0000000036a41bc1 {5}
	------------[ cut here ]------------
	kernel BUG at ../net/rxrpc/local_object.c:439!

Fix this by keeping a count of the live connections and waiting for it to
go to zero at the end of rxrpc_destroy_all_connections().

Fixes: dee46364ce6f ("rxrpc: Add RCU destruction for connections and calls")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-internal.h  | 1 +
 net/rxrpc/call_accept.c  | 2 ++
 net/rxrpc/conn_client.c  | 1 +
 net/rxrpc/conn_object.c  | 8 ++++++++
 net/rxrpc/conn_service.c | 1 +
 net/rxrpc/net_ns.c       | 1 +
 6 files changed, 14 insertions(+)

(limited to 'net')

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index cc51d3eb0548..d40d54b78567 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -77,6 +77,7 @@ struct rxrpc_net {
 	rwlock_t		call_lock;	/* Lock for ->calls */
 	atomic_t		nr_calls;	/* Count of allocated calls */
 
+	atomic_t		nr_conns;
 	struct list_head	conn_proc_list;	/* List of conns in this namespace for proc */
 	struct list_head	service_conns;	/* Service conns in this namespace */
 	rwlock_t		conn_lock;	/* Lock for ->conn_proc_list, ->service_conns */
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 5a9b1d916124..f67017dcb25e 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -219,6 +219,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
 		list_del(&conn->proc_link);
 		write_unlock(&rxnet->conn_lock);
 		kfree(conn);
+		if (atomic_dec_and_test(&rxnet->nr_conns))
+			wake_up_atomic_t(&rxnet->nr_conns);
 		tail = (tail + 1) & (size - 1);
 	}
 
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 041da40dbf93..5736f643c516 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -207,6 +207,7 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters *cp, gfp_t gfp)
 	if (ret < 0)
 		goto error_2;
 
+	atomic_inc(&rxnet->nr_conns);
 	write_lock(&rxnet->conn_lock);
 	list_add_tail(&conn->proc_link, &rxnet->conn_proc_list);
 	write_unlock(&rxnet->conn_lock);
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index bfc46fd69a62..0950ee3d26f5 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -365,6 +365,9 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu)
 	key_put(conn->params.key);
 	key_put(conn->server_key);
 	rxrpc_put_peer(conn->params.peer);
+
+	if (atomic_dec_and_test(&conn->params.local->rxnet->nr_conns))
+		wake_up_atomic_t(&conn->params.local->rxnet->nr_conns);
 	rxrpc_put_local(conn->params.local);
 
 	kfree(conn);
@@ -458,6 +461,7 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet)
 
 	_enter("");
 
+	atomic_dec(&rxnet->nr_conns);
 	rxrpc_destroy_all_client_connections(rxnet);
 
 	del_timer_sync(&rxnet->service_conn_reap_timer);
@@ -475,5 +479,9 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet)
 
 	ASSERT(list_empty(&rxnet->conn_proc_list));
 
+	/* We need to wait for the connections to be destroyed by RCU as they
+	 * pin things that we still need to get rid of.
+	 */
+	wait_on_atomic_t(&rxnet->nr_conns, atomic_t_wait, TASK_UNINTERRUPTIBLE);
 	_leave("");
 }
diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c
index f6fcdb3130a1..80773a50c755 100644
--- a/net/rxrpc/conn_service.c
+++ b/net/rxrpc/conn_service.c
@@ -132,6 +132,7 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn
 		conn->state = RXRPC_CONN_SERVICE_PREALLOC;
 		atomic_set(&conn->usage, 2);
 
+		atomic_inc(&rxnet->nr_conns);
 		write_lock(&rxnet->conn_lock);
 		list_add_tail(&conn->link, &rxnet->service_conns);
 		list_add_tail(&conn->proc_link, &rxnet->conn_proc_list);
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index 101019b0be34..fa9ce60e7bfa 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -57,6 +57,7 @@ static __net_init int rxrpc_init_net(struct net *net)
 	rwlock_init(&rxnet->call_lock);
 	atomic_set(&rxnet->nr_calls, 1);
 
+	atomic_set(&rxnet->nr_conns, 1);
 	INIT_LIST_HEAD(&rxnet->conn_proc_list);
 	INIT_LIST_HEAD(&rxnet->service_conns);
 	rwlock_init(&rxnet->conn_lock);
-- 
cgit v1.2.3


From 1159d4b496f57d5b8ee27c8b90b9d01c332e2e11 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 30 Mar 2018 21:05:38 +0100
Subject: rxrpc: Add a tracepoint to track rxrpc_peer refcounting

Add a tracepoint to track reference counting on the rxrpc_peer struct.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/ar-internal.h | 23 +++--------------
 net/rxrpc/peer_event.c  |  2 +-
 net/rxrpc/peer_object.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 68 insertions(+), 22 deletions(-)

(limited to 'net')

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index d40d54b78567..c46583bc255d 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -1041,25 +1041,10 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *,
 struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t);
 struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *,
 					      struct rxrpc_peer *);
-
-static inline struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer)
-{
-	atomic_inc(&peer->usage);
-	return peer;
-}
-
-static inline
-struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer)
-{
-	return atomic_inc_not_zero(&peer->usage) ? peer : NULL;
-}
-
-extern void __rxrpc_put_peer(struct rxrpc_peer *peer);
-static inline void rxrpc_put_peer(struct rxrpc_peer *peer)
-{
-	if (peer && atomic_dec_and_test(&peer->usage))
-		__rxrpc_put_peer(peer);
-}
+struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *);
+struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *);
+void rxrpc_put_peer(struct rxrpc_peer *);
+void __rxrpc_queue_peer_error(struct rxrpc_peer *);
 
 /*
  * proc.c
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index d01eb9a06448..78c2f95d1f22 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -192,7 +192,7 @@ void rxrpc_error_report(struct sock *sk)
 	rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
 
 	/* The ref we obtained is passed off to the work item */
-	rxrpc_queue_work(&peer->error_distributor);
+	__rxrpc_queue_peer_error(peer);
 	_leave("");
 }
 
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 94a6dbfcf129..a4a750aea1e5 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -386,9 +386,54 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local,
 }
 
 /*
- * Discard a ref on a remote peer record.
+ * Get a ref on a peer record.
  */
-void __rxrpc_put_peer(struct rxrpc_peer *peer)
+struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer)
+{
+	const void *here = __builtin_return_address(0);
+	int n;
+
+	n = atomic_inc_return(&peer->usage);
+	trace_rxrpc_peer(peer, rxrpc_peer_got, n, here);
+	return peer;
+}
+
+/*
+ * Get a ref on a peer record unless its usage has already reached 0.
+ */
+struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer)
+{
+	const void *here = __builtin_return_address(0);
+
+	if (peer) {
+		int n = __atomic_add_unless(&peer->usage, 1, 0);
+		if (n > 0)
+			trace_rxrpc_peer(peer, rxrpc_peer_got, n + 1, here);
+		else
+			peer = NULL;
+	}
+	return peer;
+}
+
+/*
+ * Queue a peer record.  This passes the caller's ref to the workqueue.
+ */
+void __rxrpc_queue_peer_error(struct rxrpc_peer *peer)
+{
+	const void *here = __builtin_return_address(0);
+	int n;
+
+	n = atomic_read(&peer->usage);
+	if (rxrpc_queue_work(&peer->error_distributor))
+		trace_rxrpc_peer(peer, rxrpc_peer_queued_error, n, here);
+	else
+		rxrpc_put_peer(peer);
+}
+
+/*
+ * Discard a peer record.
+ */
+static void __rxrpc_put_peer(struct rxrpc_peer *peer)
 {
 	struct rxrpc_net *rxnet = peer->local->rxnet;
 
@@ -402,6 +447,22 @@ void __rxrpc_put_peer(struct rxrpc_peer *peer)
 	kfree_rcu(peer, rcu);
 }
 
+/*
+ * Drop a ref on a peer record.
+ */
+void rxrpc_put_peer(struct rxrpc_peer *peer)
+{
+	const void *here = __builtin_return_address(0);
+	int n;
+
+	if (peer) {
+		n = atomic_dec_return(&peer->usage);
+		trace_rxrpc_peer(peer, rxrpc_peer_put, n, here);
+		if (n == 0)
+			__rxrpc_put_peer(peer);
+	}
+}
+
 /**
  * rxrpc_kernel_get_peer - Get the peer address of a call
  * @sock: The socket on which the call is in progress.
-- 
cgit v1.2.3


From 17226f1240381812c3a4927dc9da2814fb71c8ac Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 30 Mar 2018 21:05:44 +0100
Subject: rxrpc: Fix leak of rxrpc_peer objects

When a new client call is requested, an rxrpc_conn_parameters struct object
is passed in with a bunch of parameters set, such as the local endpoint to
use.  A pointer to the target peer record is also placed in there by
rxrpc_get_client_conn() - and this is removed if and only if a new
connection object is allocated.  Thus it leaks if a new connection object
isn't allocated.

Fix this by putting any peer object attached to the rxrpc_conn_parameters
object in the function that allocated it.

Fixes: 19ffa01c9c45 ("rxrpc: Use structs to hold connection params and protocol info")
Signed-off-by: David Howells <dhowells@redhat.com>
---
 net/rxrpc/af_rxrpc.c    |  2 ++
 net/rxrpc/ar-internal.h |  1 +
 net/rxrpc/net_ns.c      |  1 +
 net/rxrpc/peer_object.c | 21 +++++++++++++++++++++
 net/rxrpc/sendmsg.c     |  1 +
 5 files changed, 26 insertions(+)

(limited to 'net')

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 0b3026b8fa40..9a2c8e7c000e 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -324,6 +324,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
 		mutex_unlock(&call->user_mutex);
 	}
 
+	rxrpc_put_peer(cp.peer);
 	_leave(" = %p", call);
 	return call;
 }
@@ -447,6 +448,7 @@ int rxrpc_kernel_retry_call(struct socket *sock, struct rxrpc_call *call,
 		ret = rxrpc_retry_client_call(rx, call, &cp, srx, GFP_KERNEL);
 
 	mutex_unlock(&call->user_mutex);
+	rxrpc_put_peer(cp.peer);
 	_leave(" = %d", ret);
 	return ret;
 }
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index c46583bc255d..90d7079e0aa9 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -1041,6 +1041,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *,
 struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t);
 struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *,
 					      struct rxrpc_peer *);
+void rxrpc_destroy_all_peers(struct rxrpc_net *);
 struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *);
 struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *);
 void rxrpc_put_peer(struct rxrpc_peer *);
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index fa9ce60e7bfa..c7a023fb22d0 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -118,6 +118,7 @@ static __net_exit void rxrpc_exit_net(struct net *net)
 	cancel_work_sync(&rxnet->peer_keepalive_work);
 	rxrpc_destroy_all_calls(rxnet);
 	rxrpc_destroy_all_connections(rxnet);
+	rxrpc_destroy_all_peers(rxnet);
 	rxrpc_destroy_all_locals(rxnet);
 	proc_remove(rxnet->proc_net);
 }
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index a4a750aea1e5..1b7e8107b3ae 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -463,6 +463,27 @@ void rxrpc_put_peer(struct rxrpc_peer *peer)
 	}
 }
 
+/*
+ * Make sure all peer records have been discarded.
+ */
+void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet)
+{
+	struct rxrpc_peer *peer;
+	int i;
+
+	for (i = 0; i < HASH_SIZE(rxnet->peer_hash); i++) {
+		if (hlist_empty(&rxnet->peer_hash[i]))
+			continue;
+
+		hlist_for_each_entry(peer, &rxnet->peer_hash[i], hash_link) {
+			pr_err("Leaked peer %u {%u} %pISp\n",
+			       peer->debug_id,
+			       atomic_read(&peer->usage),
+			       &peer->srx.transport);
+		}
+	}
+}
+
 /**
  * rxrpc_kernel_get_peer - Get the peer address of a call
  * @sock: The socket on which the call is in progress.
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index a62980a80151..206e802ccbdc 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -586,6 +586,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
 				     atomic_inc_return(&rxrpc_debug_id));
 	/* The socket is now unlocked */
 
+	rxrpc_put_peer(cp.peer);
 	_leave(" = %p\n", call);
 	return call;
 }
-- 
cgit v1.2.3


From 5e43f899b03a3492ce5fc44e8900becb04dae9c0 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 30 Mar 2018 15:08:00 -0700
Subject: bpf: Check attach type at prog load time

== The problem ==

There are use-cases when a program of some type can be attached to
multiple attach points and those attach points must have different
permissions to access context or to call helpers.

E.g. context structure may have fields for both IPv4 and IPv6 but it
doesn't make sense to read from / write to IPv6 field when attach point
is somewhere in IPv4 stack.

Same applies to BPF-helpers: it may make sense to call some helper from
some attach point, but not from other for same prog type.

== The solution ==

Introduce `expected_attach_type` field in in `struct bpf_attr` for
`BPF_PROG_LOAD` command. If scenario described in "The problem" section
is the case for some prog type, the field will be checked twice:

1) At load time prog type is checked to see if attach type for it must
   be known to validate program permissions correctly. Prog will be
   rejected with EINVAL if it's the case and `expected_attach_type` is
   not specified or has invalid value.

2) At attach time `attach_type` is compared with `expected_attach_type`,
   if prog type requires to have one, and, if they differ, attach will
   be rejected with EINVAL.

The `expected_attach_type` is now available as part of `struct bpf_prog`
in both `bpf_verifier_ops->is_valid_access()` and
`bpf_verifier_ops->get_func_proto()` () and can be used to check context
accesses and calls to helpers correspondingly.

Initially the idea was discussed by Alexei Starovoitov <ast@fb.com> and
Daniel Borkmann <daniel@iogearbox.net> here:
https://marc.info/?l=linux-netdev&m=152107378717201&w=2

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index e989bf313195..7790fd128614 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3685,7 +3685,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 }
 
 static const struct bpf_func_proto *
-sock_filter_func_proto(enum bpf_func_id func_id)
+sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	/* inet and inet6 sockets are created in a process
@@ -3699,7 +3699,7 @@ sock_filter_func_proto(enum bpf_func_id func_id)
 }
 
 static const struct bpf_func_proto *
-sk_filter_func_proto(enum bpf_func_id func_id)
+sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_skb_load_bytes:
@@ -3714,7 +3714,7 @@ sk_filter_func_proto(enum bpf_func_id func_id)
 }
 
 static const struct bpf_func_proto *
-tc_cls_act_func_proto(enum bpf_func_id func_id)
+tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_skb_store_bytes:
@@ -3781,7 +3781,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 }
 
 static const struct bpf_func_proto *
-xdp_func_proto(enum bpf_func_id func_id)
+xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
@@ -3804,7 +3804,7 @@ xdp_func_proto(enum bpf_func_id func_id)
 }
 
 static const struct bpf_func_proto *
-lwt_inout_func_proto(enum bpf_func_id func_id)
+lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_skb_load_bytes:
@@ -3831,7 +3831,7 @@ lwt_inout_func_proto(enum bpf_func_id func_id)
 }
 
 static const struct bpf_func_proto *
-	sock_ops_func_proto(enum bpf_func_id func_id)
+sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_setsockopt:
@@ -3847,7 +3847,8 @@ static const struct bpf_func_proto *
 	}
 }
 
-static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *
+sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_msg_redirect_map:
@@ -3863,7 +3864,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id)
 	}
 }
 
-static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *
+sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_skb_store_bytes:
@@ -3888,7 +3890,7 @@ static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id)
 }
 
 static const struct bpf_func_proto *
-lwt_xmit_func_proto(enum bpf_func_id func_id)
+lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_skb_get_tunnel_key:
@@ -3918,11 +3920,12 @@ lwt_xmit_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_set_hash_invalid:
 		return &bpf_set_hash_invalid_proto;
 	default:
-		return lwt_inout_func_proto(func_id);
+		return lwt_inout_func_proto(func_id, prog);
 	}
 }
 
 static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
+				    const struct bpf_prog *prog,
 				    struct bpf_insn_access_aux *info)
 {
 	const int size_default = sizeof(__u32);
@@ -3966,6 +3969,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 
 static bool sk_filter_is_valid_access(int off, int size,
 				      enum bpf_access_type type,
+				      const struct bpf_prog *prog,
 				      struct bpf_insn_access_aux *info)
 {
 	switch (off) {
@@ -3986,11 +3990,12 @@ static bool sk_filter_is_valid_access(int off, int size,
 		}
 	}
 
-	return bpf_skb_is_valid_access(off, size, type, info);
+	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
 static bool lwt_is_valid_access(int off, int size,
 				enum bpf_access_type type,
+				const struct bpf_prog *prog,
 				struct bpf_insn_access_aux *info)
 {
 	switch (off) {
@@ -4020,11 +4025,12 @@ static bool lwt_is_valid_access(int off, int size,
 		break;
 	}
 
-	return bpf_skb_is_valid_access(off, size, type, info);
+	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
 static bool sock_filter_is_valid_access(int off, int size,
 					enum bpf_access_type type,
+					const struct bpf_prog *prog,
 					struct bpf_insn_access_aux *info)
 {
 	if (type == BPF_WRITE) {
@@ -4096,6 +4102,7 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
 
 static bool tc_cls_act_is_valid_access(int off, int size,
 				       enum bpf_access_type type,
+				       const struct bpf_prog *prog,
 				       struct bpf_insn_access_aux *info)
 {
 	if (type == BPF_WRITE) {
@@ -4125,7 +4132,7 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 		return false;
 	}
 
-	return bpf_skb_is_valid_access(off, size, type, info);
+	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
 static bool __is_valid_xdp_access(int off, int size)
@@ -4142,6 +4149,7 @@ static bool __is_valid_xdp_access(int off, int size)
 
 static bool xdp_is_valid_access(int off, int size,
 				enum bpf_access_type type,
+				const struct bpf_prog *prog,
 				struct bpf_insn_access_aux *info)
 {
 	if (type == BPF_WRITE)
@@ -4174,6 +4182,7 @@ EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
 static bool sock_ops_is_valid_access(int off, int size,
 				     enum bpf_access_type type,
+				     const struct bpf_prog *prog,
 				     struct bpf_insn_access_aux *info)
 {
 	const int size_default = sizeof(__u32);
@@ -4220,6 +4229,7 @@ static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
 
 static bool sk_skb_is_valid_access(int off, int size,
 				   enum bpf_access_type type,
+				   const struct bpf_prog *prog,
 				   struct bpf_insn_access_aux *info)
 {
 	switch (off) {
@@ -4249,11 +4259,12 @@ static bool sk_skb_is_valid_access(int off, int size,
 		break;
 	}
 
-	return bpf_skb_is_valid_access(off, size, type, info);
+	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
 static bool sk_msg_is_valid_access(int off, int size,
 				   enum bpf_access_type type,
+				   const struct bpf_prog *prog,
 				   struct bpf_insn_access_aux *info)
 {
 	if (type == BPF_WRITE)
-- 
cgit v1.2.3


From 4fbac77d2d092b475dda9eea66da674369665427 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 30 Mar 2018 15:08:02 -0700
Subject: bpf: Hooks for sys_bind

== The problem ==

There is a use-case when all processes inside a cgroup should use one
single IP address on a host that has multiple IP configured.  Those
processes should use the IP for both ingress and egress, for TCP and UDP
traffic. So TCP/UDP servers should be bound to that IP to accept
incoming connections on it, and TCP/UDP clients should make outgoing
connections from that IP. It should not require changing application
code since it's often not possible.

Currently it's solved by intercepting glibc wrappers around syscalls
such as `bind(2)` and `connect(2)`. It's done by a shared library that
is preloaded for every process in a cgroup so that whenever TCP/UDP
server calls `bind(2)`, the library replaces IP in sockaddr before
passing arguments to syscall. When application calls `connect(2)` the
library transparently binds the local end of connection to that IP
(`bind(2)` with `IP_BIND_ADDRESS_NO_PORT` to avoid performance penalty).

Shared library approach is fragile though, e.g.:
* some applications clear env vars (incl. `LD_PRELOAD`);
* `/etc/ld.so.preload` doesn't help since some applications are linked
  with option `-z nodefaultlib`;
* other applications don't use glibc and there is nothing to intercept.

== The solution ==

The patch provides much more reliable in-kernel solution for the 1st
part of the problem: binding TCP/UDP servers on desired IP. It does not
depend on application environment and implementation details (whether
glibc is used or not).

It adds new eBPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` and
attach types `BPF_CGROUP_INET4_BIND` and `BPF_CGROUP_INET6_BIND`
(similar to already existing `BPF_CGROUP_INET_SOCK_CREATE`).

The new program type is intended to be used with sockets (`struct sock`)
in a cgroup and provided by user `struct sockaddr`. Pointers to both of
them are parts of the context passed to programs of newly added types.

The new attach types provides hooks in `bind(2)` system call for both
IPv4 and IPv6 so that one can write a program to override IP addresses
and ports user program tries to bind to and apply such a program for
whole cgroup.

== Implementation notes ==

[1]
Separate attach types for `AF_INET` and `AF_INET6` are added
intentionally to prevent reading/writing to offsets that don't make
sense for corresponding socket family. E.g. if user passes `sockaddr_in`
it doesn't make sense to read from / write to `user_ip6[]` context
fields.

[2]
The write access to `struct bpf_sock_addr_kern` is implemented using
special field as an additional "register".

There are just two registers in `sock_addr_convert_ctx_access`: `src`
with value to write and `dst` with pointer to context that can't be
changed not to break later instructions. But the fields, allowed to
write to, are not available directly and to access them address of
corresponding pointer has to be loaded first. To get additional register
the 1st not used by `src` and `dst` one is taken, its content is saved
to `bpf_sock_addr_kern.tmp_reg`, then the register is used to load
address of pointer field, and finally the register's content is restored
from the temporary field after writing `src` value.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c   | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/af_inet.c  |   7 ++
 net/ipv6/af_inet6.c |   7 ++
 3 files changed, 246 insertions(+)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 7790fd128614..c08e5b121558 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3698,6 +3698,20 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+static const struct bpf_func_proto *
+sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	/* inet and inet6 sockets are created in a process
+	 * context so there is always a valid uid/gid
+	 */
+	case BPF_FUNC_get_current_uid_gid:
+		return &bpf_get_current_uid_gid_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
 static const struct bpf_func_proto *
 sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -4180,6 +4194,69 @@ void bpf_warn_invalid_xdp_action(u32 act)
 }
 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
 
+static bool sock_addr_is_valid_access(int off, int size,
+				      enum bpf_access_type type,
+				      const struct bpf_prog *prog,
+				      struct bpf_insn_access_aux *info)
+{
+	const int size_default = sizeof(__u32);
+
+	if (off < 0 || off >= sizeof(struct bpf_sock_addr))
+		return false;
+	if (off % size != 0)
+		return false;
+
+	/* Disallow access to IPv6 fields from IPv4 contex and vise
+	 * versa.
+	 */
+	switch (off) {
+	case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
+		switch (prog->expected_attach_type) {
+		case BPF_CGROUP_INET4_BIND:
+			break;
+		default:
+			return false;
+		}
+		break;
+	case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+		switch (prog->expected_attach_type) {
+		case BPF_CGROUP_INET6_BIND:
+			break;
+		default:
+			return false;
+		}
+		break;
+	}
+
+	switch (off) {
+	case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
+	case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+		/* Only narrow read access allowed for now. */
+		if (type == BPF_READ) {
+			bpf_ctx_record_field_size(info, size_default);
+			if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+				return false;
+		} else {
+			if (size != size_default)
+				return false;
+		}
+		break;
+	case bpf_ctx_range(struct bpf_sock_addr, user_port):
+		if (size != size_default)
+			return false;
+		break;
+	default:
+		if (type == BPF_READ) {
+			if (size != size_default)
+				return false;
+		} else {
+			return false;
+		}
+	}
+
+	return true;
+}
+
 static bool sock_ops_is_valid_access(int off, int size,
 				     enum bpf_access_type type,
 				     const struct bpf_prog *prog,
@@ -4724,6 +4801,152 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
+/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of
+ * context Structure, F is Field in context structure that contains a pointer
+ * to Nested Structure of type NS that has the field NF.
+ *
+ * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make
+ * sure that SIZE is not greater than actual size of S.F.NF.
+ *
+ * If offset OFF is provided, the load happens from that offset relative to
+ * offset of NF.
+ */
+#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF)	       \
+	do {								       \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg,     \
+				      si->src_reg, offsetof(S, F));	       \
+		*insn++ = BPF_LDX_MEM(					       \
+			SIZE, si->dst_reg, si->dst_reg,			       \
+			bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF),	       \
+				       target_size)			       \
+				+ OFF);					       \
+	} while (0)
+
+#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF)			       \
+	SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF,		       \
+					     BPF_FIELD_SIZEOF(NS, NF), 0)
+
+/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to
+ * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation.
+ *
+ * It doesn't support SIZE argument though since narrow stores are not
+ * supported for now.
+ *
+ * In addition it uses Temporary Field TF (member of struct S) as the 3rd
+ * "register" since two registers available in convert_ctx_access are not
+ * enough: we can't override neither SRC, since it contains value to store, nor
+ * DST since it contains pointer to context that may be used by later
+ * instructions. But we need a temporary place to save pointer to nested
+ * structure whose field we want to store to.
+ */
+#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF)		       \
+	do {								       \
+		int tmp_reg = BPF_REG_9;				       \
+		if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)	       \
+			--tmp_reg;					       \
+		if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)	       \
+			--tmp_reg;					       \
+		*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg,	       \
+				      offsetof(S, TF));			       \
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg,	       \
+				      si->dst_reg, offsetof(S, F));	       \
+		*insn++ = BPF_STX_MEM(					       \
+			BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg,	       \
+			bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF),	       \
+				       target_size)			       \
+				+ OFF);					       \
+		*insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg,	       \
+				      offsetof(S, TF));			       \
+	} while (0)
+
+#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \
+						      TF)		       \
+	do {								       \
+		if (type == BPF_WRITE) {				       \
+			SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF,    \
+							 TF);		       \
+		} else {						       \
+			SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(		       \
+				S, NS, F, NF, SIZE, OFF);  \
+		}							       \
+	} while (0)
+
+#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF)		       \
+	SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(			       \
+		S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF)
+
+static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
+					const struct bpf_insn *si,
+					struct bpf_insn *insn_buf,
+					struct bpf_prog *prog, u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+	int off;
+
+	switch (si->off) {
+	case offsetof(struct bpf_sock_addr, user_family):
+		SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+					    struct sockaddr, uaddr, sa_family);
+		break;
+
+	case offsetof(struct bpf_sock_addr, user_ip4):
+		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+			struct bpf_sock_addr_kern, struct sockaddr_in, uaddr,
+			sin_addr, BPF_SIZE(si->code), 0, tmp_reg);
+		break;
+
+	case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
+		off = si->off;
+		off -= offsetof(struct bpf_sock_addr, user_ip6[0]);
+		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+			struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
+			sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off,
+			tmp_reg);
+		break;
+
+	case offsetof(struct bpf_sock_addr, user_port):
+		/* To get port we need to know sa_family first and then treat
+		 * sockaddr as either sockaddr_in or sockaddr_in6.
+		 * Though we can simplify since port field has same offset and
+		 * size in both structures.
+		 * Here we check this invariant and use just one of the
+		 * structures if it's true.
+		 */
+		BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
+			     offsetof(struct sockaddr_in6, sin6_port));
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) !=
+			     FIELD_SIZEOF(struct sockaddr_in6, sin6_port));
+		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern,
+						     struct sockaddr_in6, uaddr,
+						     sin6_port, tmp_reg);
+		break;
+
+	case offsetof(struct bpf_sock_addr, family):
+		SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+					    struct sock, sk, sk_family);
+		break;
+
+	case offsetof(struct bpf_sock_addr, type):
+		SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(
+			struct bpf_sock_addr_kern, struct sock, sk,
+			__sk_flags_offset, BPF_W, 0);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
+		break;
+
+	case offsetof(struct bpf_sock_addr, protocol):
+		SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(
+			struct bpf_sock_addr_kern, struct sock, sk,
+			__sk_flags_offset, BPF_W, 0);
+		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
+		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
+					SK_FL_PROTO_SHIFT);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 				       const struct bpf_insn *si,
 				       struct bpf_insn *insn_buf,
@@ -5181,6 +5404,15 @@ const struct bpf_verifier_ops cg_sock_verifier_ops = {
 const struct bpf_prog_ops cg_sock_prog_ops = {
 };
 
+const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
+	.get_func_proto		= sock_addr_func_proto,
+	.is_valid_access	= sock_addr_is_valid_access,
+	.convert_ctx_access	= sock_addr_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_sock_addr_prog_ops = {
+};
+
 const struct bpf_verifier_ops sock_ops_verifier_ops = {
 	.get_func_proto		= sock_ops_func_proto,
 	.is_valid_access	= sock_ops_is_valid_access,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e8c7fad8c329..2dec266507dc 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -450,6 +450,13 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (addr_len < sizeof(struct sockaddr_in))
 		goto out;
 
+	/* BPF prog is run before any checks are done so that if the prog
+	 * changes context in a wrong way it will be caught.
+	 */
+	err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr);
+	if (err)
+		goto out;
+
 	if (addr->sin_family != AF_INET) {
 		/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
 		 * only if s_addr is INADDR_ANY.
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index dbbe04018813..fa24e3f06ac6 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -295,6 +295,13 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (addr_len < SIN6_LEN_RFC2133)
 		return -EINVAL;
 
+	/* BPF prog is run before any checks are done so that if the prog
+	 * changes context in a wrong way it will be caught.
+	 */
+	err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
+	if (err)
+		return err;
+
 	if (addr->sin6_family != AF_INET6)
 		return -EAFNOSUPPORT;
 
-- 
cgit v1.2.3


From 3679d585bbc07a1ac4448d5b478b492cad3587ce Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 30 Mar 2018 15:08:04 -0700
Subject: net: Introduce __inet_bind() and __inet6_bind

Refactor `bind()` code to make it ready to be called from BPF helper
function `bpf_bind()` (will be added soon). Implementation of
`inet_bind()` and `inet6_bind()` is separated into `__inet_bind()` and
`__inet6_bind()` correspondingly. These function can be used from both
`sk_prot->bind` and `bpf_bind()` contexts.

New functions have two additional arguments.

`force_bind_address_no_port` forces binding to IP only w/o checking
`inet_sock.bind_address_no_port` field. It'll allow to bind local end of
a connection to desired IP in `bpf_bind()` w/o changing
`bind_address_no_port` field of a socket. It's useful since `bpf_bind()`
can return an error and we'd need to restore original value of
`bind_address_no_port` in that case if we changed this before calling to
the helper.

`with_lock` specifies whether to lock socket when working with `struct
sk` or not. The argument is set to `true` for `sk_prot->bind`, i.e. old
behavior is preserved. But it will be set to `false` for `bpf_bind()`
use-case. The reason is all call-sites, where `bpf_bind()` will be
called, already hold that socket lock.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/ipv4/af_inet.c  | 39 ++++++++++++++++++++++++---------------
 net/ipv6/af_inet6.c | 37 ++++++++++++++++++++++++-------------
 2 files changed, 48 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 2dec266507dc..e203a39d6988 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -432,30 +432,37 @@ EXPORT_SYMBOL(inet_release);
 
 int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
-	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
 	struct sock *sk = sock->sk;
-	struct inet_sock *inet = inet_sk(sk);
-	struct net *net = sock_net(sk);
-	unsigned short snum;
-	int chk_addr_ret;
-	u32 tb_id = RT_TABLE_LOCAL;
 	int err;
 
 	/* If the socket has its own bind function then use it. (RAW) */
 	if (sk->sk_prot->bind) {
-		err = sk->sk_prot->bind(sk, uaddr, addr_len);
-		goto out;
+		return sk->sk_prot->bind(sk, uaddr, addr_len);
 	}
-	err = -EINVAL;
 	if (addr_len < sizeof(struct sockaddr_in))
-		goto out;
+		return -EINVAL;
 
 	/* BPF prog is run before any checks are done so that if the prog
 	 * changes context in a wrong way it will be caught.
 	 */
 	err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr);
 	if (err)
-		goto out;
+		return err;
+
+	return __inet_bind(sk, uaddr, addr_len, false, true);
+}
+EXPORT_SYMBOL(inet_bind);
+
+int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+		bool force_bind_address_no_port, bool with_lock)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+	struct inet_sock *inet = inet_sk(sk);
+	struct net *net = sock_net(sk);
+	unsigned short snum;
+	int chk_addr_ret;
+	u32 tb_id = RT_TABLE_LOCAL;
+	int err;
 
 	if (addr->sin_family != AF_INET) {
 		/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
@@ -499,7 +506,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	 *      would be illegal to use them (multicast/broadcast) in
 	 *      which case the sending device address is used.
 	 */
-	lock_sock(sk);
+	if (with_lock)
+		lock_sock(sk);
 
 	/* Check these errors (active socket, double bind). */
 	err = -EINVAL;
@@ -511,7 +519,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		inet->inet_saddr = 0;  /* Use device */
 
 	/* Make sure we are allowed to bind here. */
-	if ((snum || !inet->bind_address_no_port) &&
+	if ((snum || !(inet->bind_address_no_port ||
+		       force_bind_address_no_port)) &&
 	    sk->sk_prot->get_port(sk, snum)) {
 		inet->inet_saddr = inet->inet_rcv_saddr = 0;
 		err = -EADDRINUSE;
@@ -528,11 +537,11 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	sk_dst_reset(sk);
 	err = 0;
 out_release_sock:
-	release_sock(sk);
+	if (with_lock)
+		release_sock(sk);
 out:
 	return err;
 }
-EXPORT_SYMBOL(inet_bind);
 
 int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 		       int addr_len, int flags)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index fa24e3f06ac6..13110bee5c14 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -277,15 +277,7 @@ out_rcu_unlock:
 /* bind for INET6 API */
 int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
-	struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
 	struct sock *sk = sock->sk;
-	struct inet_sock *inet = inet_sk(sk);
-	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct net *net = sock_net(sk);
-	__be32 v4addr = 0;
-	unsigned short snum;
-	bool saved_ipv6only;
-	int addr_type = 0;
 	int err = 0;
 
 	/* If the socket has its own bind function then use it. */
@@ -302,11 +294,28 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (err)
 		return err;
 
+	return __inet6_bind(sk, uaddr, addr_len, false, true);
+}
+EXPORT_SYMBOL(inet6_bind);
+
+int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+		 bool force_bind_address_no_port, bool with_lock)
+{
+	struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipv6_pinfo *np = inet6_sk(sk);
+	struct net *net = sock_net(sk);
+	__be32 v4addr = 0;
+	unsigned short snum;
+	bool saved_ipv6only;
+	int addr_type = 0;
+	int err = 0;
+
 	if (addr->sin6_family != AF_INET6)
 		return -EAFNOSUPPORT;
 
 	addr_type = ipv6_addr_type(&addr->sin6_addr);
-	if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM)
+	if ((addr_type & IPV6_ADDR_MULTICAST) && sk->sk_type == SOCK_STREAM)
 		return -EINVAL;
 
 	snum = ntohs(addr->sin6_port);
@@ -314,7 +323,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
 		return -EACCES;
 
-	lock_sock(sk);
+	if (with_lock)
+		lock_sock(sk);
 
 	/* Check these errors (active socket, double bind). */
 	if (sk->sk_state != TCP_CLOSE || inet->inet_num) {
@@ -402,7 +412,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		sk->sk_ipv6only = 1;
 
 	/* Make sure we are allowed to bind here. */
-	if ((snum || !inet->bind_address_no_port) &&
+	if ((snum || !(inet->bind_address_no_port ||
+		       force_bind_address_no_port)) &&
 	    sk->sk_prot->get_port(sk, snum)) {
 		sk->sk_ipv6only = saved_ipv6only;
 		inet_reset_saddr(sk);
@@ -418,13 +429,13 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	inet->inet_dport = 0;
 	inet->inet_daddr = 0;
 out:
-	release_sock(sk);
+	if (with_lock)
+		release_sock(sk);
 	return err;
 out_unlock:
 	rcu_read_unlock();
 	goto out;
 }
-EXPORT_SYMBOL(inet6_bind);
 
 int inet6_release(struct socket *sock)
 {
-- 
cgit v1.2.3


From d74bad4e74ee373787a9ae24197c17b7cdc428d5 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 30 Mar 2018 15:08:05 -0700
Subject: bpf: Hooks for sys_connect

== The problem ==

See description of the problem in the initial patch of this patch set.

== The solution ==

The patch provides much more reliable in-kernel solution for the 2nd
part of the problem: making outgoing connecttion from desired IP.

It adds new attach types `BPF_CGROUP_INET4_CONNECT` and
`BPF_CGROUP_INET6_CONNECT` for program type
`BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that can be used to override both
source and destination of a connection at connect(2) time.

Local end of connection can be bound to desired IP using newly
introduced BPF-helper `bpf_bind()`. It allows to bind to only IP though,
and doesn't support binding to port, i.e. leverages
`IP_BIND_ADDRESS_NO_PORT` socket option. There are two reasons for this:
* looking for a free port is expensive and can affect performance
  significantly;
* there is no use-case for port.

As for remote end (`struct sockaddr *` passed by user), both parts of it
can be overridden, remote IP and remote port. It's useful if an
application inside cgroup wants to connect to another application inside
same cgroup or to itself, but knows nothing about IP assigned to the
cgroup.

Support is added for IPv4 and IPv6, for TCP and UDP.

IPv4 and IPv6 have separate attach types for same reason as sys_bind
hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields
when user passes sockaddr_in since it'd be out-of-bound.

== Implementation notes ==

The patch introduces new field in `struct proto`: `pre_connect` that is
a pointer to a function with same signature as `connect` but is called
before it. The reason is in some cases BPF hooks should be called way
before control is passed to `sk->sk_prot->connect`. Specifically
`inet_dgram_connect` autobinds socket before calling
`sk->sk_prot->connect` and there is no way to call `bpf_bind()` from
hooks from e.g. `ip4_datagram_connect` or `ip6_datagram_connect` since
it'd cause double-bind. On the other hand `proto.pre_connect` provides a
flexible way to add BPF hooks for connect only for necessary `proto` and
call them at desired time before `connect`. Since `bpf_bind()` is
allowed to bind only to IP and autobind in `inet_dgram_connect` binds
only port there is no chance of double-bind.

bpf_bind() sets `force_bind_address_no_port` to bind to only IP despite
of value of `bind_address_no_port` socket field.

bpf_bind() sets `with_lock` to `false` when calling to __inet_bind()
and __inet6_bind() since all call-sites, where bpf_bind() is called,
already hold socket lock.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c   | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/af_inet.c  | 13 ++++++++++++
 net/ipv4/tcp_ipv4.c | 16 +++++++++++++++
 net/ipv4/udp.c      | 14 +++++++++++++
 net/ipv6/af_inet6.c |  5 +++++
 net/ipv6/tcp_ipv6.c | 16 +++++++++++++++
 net/ipv6/udp.c      | 20 +++++++++++++++++++
 7 files changed, 141 insertions(+)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index c08e5b121558..bdb9cadd4d27 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -33,6 +33,7 @@
 #include <linux/if_packet.h>
 #include <linux/if_arp.h>
 #include <linux/gfp.h>
+#include <net/inet_common.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <net/netlink.h>
@@ -3656,6 +3657,52 @@ static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
+EXPORT_SYMBOL_GPL(ipv6_bpf_stub);
+
+BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
+	   int, addr_len)
+{
+#ifdef CONFIG_INET
+	struct sock *sk = ctx->sk;
+	int err;
+
+	/* Binding to port can be expensive so it's prohibited in the helper.
+	 * Only binding to IP is supported.
+	 */
+	err = -EINVAL;
+	if (addr->sa_family == AF_INET) {
+		if (addr_len < sizeof(struct sockaddr_in))
+			return err;
+		if (((struct sockaddr_in *)addr)->sin_port != htons(0))
+			return err;
+		return __inet_bind(sk, addr, addr_len, true, false);
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (addr->sa_family == AF_INET6) {
+		if (addr_len < SIN6_LEN_RFC2133)
+			return err;
+		if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
+			return err;
+		/* ipv6_bpf_stub cannot be NULL, since it's called from
+		 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
+		 */
+		return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false);
+#endif /* CONFIG_IPV6 */
+	}
+#endif /* CONFIG_INET */
+
+	return -EAFNOSUPPORT;
+}
+
+static const struct bpf_func_proto bpf_bind_proto = {
+	.func		= bpf_bind,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -3707,6 +3754,14 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	 */
 	case BPF_FUNC_get_current_uid_gid:
 		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_bind:
+		switch (prog->expected_attach_type) {
+		case BPF_CGROUP_INET4_CONNECT:
+		case BPF_CGROUP_INET6_CONNECT:
+			return &bpf_bind_proto;
+		default:
+			return NULL;
+		}
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -4213,6 +4268,7 @@ static bool sock_addr_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
 		switch (prog->expected_attach_type) {
 		case BPF_CGROUP_INET4_BIND:
+		case BPF_CGROUP_INET4_CONNECT:
 			break;
 		default:
 			return false;
@@ -4221,6 +4277,7 @@ static bool sock_addr_is_valid_access(int off, int size,
 	case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
 		switch (prog->expected_attach_type) {
 		case BPF_CGROUP_INET6_BIND:
+		case BPF_CGROUP_INET6_CONNECT:
 			break;
 		default:
 			return false;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index e203a39d6988..488fe26ac8e5 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -547,12 +547,19 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 		       int addr_len, int flags)
 {
 	struct sock *sk = sock->sk;
+	int err;
 
 	if (addr_len < sizeof(uaddr->sa_family))
 		return -EINVAL;
 	if (uaddr->sa_family == AF_UNSPEC)
 		return sk->sk_prot->disconnect(sk, flags);
 
+	if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
+		err = sk->sk_prot->pre_connect(sk, uaddr, addr_len);
+		if (err)
+			return err;
+	}
+
 	if (!inet_sk(sk)->inet_num && inet_autobind(sk))
 		return -EAGAIN;
 	return sk->sk_prot->connect(sk, uaddr, addr_len);
@@ -633,6 +640,12 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		if (sk->sk_state != TCP_CLOSE)
 			goto out;
 
+		if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
+			err = sk->sk_prot->pre_connect(sk, uaddr, addr_len);
+			if (err)
+				goto out;
+		}
+
 		err = sk->sk_prot->connect(sk, uaddr, addr_len);
 		if (err < 0)
 			goto out;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2c6aec2643e8..3c11d992d784 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -140,6 +140,21 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 }
 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 
+static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+			      int addr_len)
+{
+	/* This check is replicated from tcp_v4_connect() and intended to
+	 * prevent BPF program called below from accessing bytes that are out
+	 * of the bound specified by user in addr_len.
+	 */
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	sock_owned_by_me(sk);
+
+	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
+}
+
 /* This will initiate an outgoing connection. */
 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 {
@@ -2409,6 +2424,7 @@ struct proto tcp_prot = {
 	.name			= "TCP",
 	.owner			= THIS_MODULE,
 	.close			= tcp_close,
+	.pre_connect		= tcp_v4_pre_connect,
 	.connect		= tcp_v4_connect,
 	.disconnect		= tcp_disconnect,
 	.accept			= inet_csk_accept,
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 908fc02fb4f8..9c6c77fec963 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1658,6 +1658,19 @@ csum_copy_err:
 	goto try_again;
 }
 
+int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	/* This check is replicated from __ip4_datagram_connect() and
+	 * intended to prevent BPF program called below from accessing bytes
+	 * that are out of the bound specified by user in addr_len.
+	 */
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
+}
+EXPORT_SYMBOL(udp_pre_connect);
+
 int __udp_disconnect(struct sock *sk, int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -2530,6 +2543,7 @@ struct proto udp_prot = {
 	.name			= "UDP",
 	.owner			= THIS_MODULE,
 	.close			= udp_lib_close,
+	.pre_connect		= udp_pre_connect,
 	.connect		= ip4_datagram_connect,
 	.disconnect		= udp_disconnect,
 	.ioctl			= udp_ioctl,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 13110bee5c14..566cec0e0a44 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -887,6 +887,10 @@ static const struct ipv6_stub ipv6_stub_impl = {
 	.nd_tbl	= &nd_tbl,
 };
 
+static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
+	.inet6_bind = __inet6_bind,
+};
+
 static int __init inet6_init(void)
 {
 	struct list_head *r;
@@ -1043,6 +1047,7 @@ static int __init inet6_init(void)
 	/* ensure that ipv6 stubs are visible only after ipv6 is ready */
 	wmb();
 	ipv6_stub = &ipv6_stub_impl;
+	ipv6_bpf_stub = &ipv6_bpf_stub_impl;
 out:
 	return err;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5425d7b100ee..6469b741cf5a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -117,6 +117,21 @@ static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb)
 				   ipv6_hdr(skb)->saddr.s6_addr32);
 }
 
+static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+			      int addr_len)
+{
+	/* This check is replicated from tcp_v6_connect() and intended to
+	 * prevent BPF program called below from accessing bytes that are out
+	 * of the bound specified by user in addr_len.
+	 */
+	if (addr_len < SIN6_LEN_RFC2133)
+		return -EINVAL;
+
+	sock_owned_by_me(sk);
+
+	return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr);
+}
+
 static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 			  int addr_len)
 {
@@ -1925,6 +1940,7 @@ struct proto tcpv6_prot = {
 	.name			= "TCPv6",
 	.owner			= THIS_MODULE,
 	.close			= tcp_close,
+	.pre_connect		= tcp_v6_pre_connect,
 	.connect		= tcp_v6_connect,
 	.disconnect		= tcp_disconnect,
 	.accept			= inet_csk_accept,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index ad30f5e31969..6861ed479469 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -957,6 +957,25 @@ static void udp_v6_flush_pending_frames(struct sock *sk)
 	}
 }
 
+static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+			     int addr_len)
+{
+	/* The following checks are replicated from __ip6_datagram_connect()
+	 * and intended to prevent BPF program called below from accessing
+	 * bytes that are out of the bound specified by user in addr_len.
+	 */
+	if (uaddr->sa_family == AF_INET) {
+		if (__ipv6_only_sock(sk))
+			return -EAFNOSUPPORT;
+		return udp_pre_connect(sk, uaddr, addr_len);
+	}
+
+	if (addr_len < SIN6_LEN_RFC2133)
+		return -EINVAL;
+
+	return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr);
+}
+
 /**
  *	udp6_hwcsum_outgoing  -  handle outgoing HW checksumming
  *	@sk:	socket we are sending on
@@ -1512,6 +1531,7 @@ struct proto udpv6_prot = {
 	.name			= "UDPv6",
 	.owner			= THIS_MODULE,
 	.close			= udp_lib_close,
+	.pre_connect		= udpv6_pre_connect,
 	.connect		= ip6_datagram_connect,
 	.disconnect		= udp_disconnect,
 	.ioctl			= udp_ioctl,
-- 
cgit v1.2.3


From aac3fc320d9404f2665a8b1249dc3170d5fa3caf Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 30 Mar 2018 15:08:07 -0700
Subject: bpf: Post-hooks for sys_bind

"Post-hooks" are hooks that are called right before returning from
sys_bind. At this time IP and port are already allocated and no further
changes to `struct sock` can happen before returning from sys_bind but
BPF program has a chance to inspect the socket and change sys_bind
result.

Specifically it can e.g. inspect what port was allocated and if it
doesn't satisfy some policy, BPF program can force sys_bind to fail and
return EPERM to user.

Another example of usage is recording the IP:port pair to some map to
use it in later calls to sys_connect. E.g. if some TCP server inside
cgroup was bound to some IP:port_n, it can be recorded to a map. And
later when some TCP client inside same cgroup is trying to connect to
127.0.0.1:port_n, BPF hook for sys_connect can override the destination
and connect application to IP:port_n instead of 127.0.0.1:port_n. That
helps forcing all applications inside a cgroup to use desired IP and not
break those applications if they e.g. use localhost to communicate
between each other.

== Implementation details ==

Post-hooks are implemented as two new attach types
`BPF_CGROUP_INET4_POST_BIND` and `BPF_CGROUP_INET6_POST_BIND` for
existing prog type `BPF_PROG_TYPE_CGROUP_SOCK`.

Separate attach types for IPv4 and IPv6 are introduced to avoid access
to IPv6 field in `struct sock` from `inet_bind()` and to IPv4 field from
`inet6_bind()` since those fields might not make sense in such cases.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c   | 116 +++++++++++++++++++++++++++++++++++++++++++++-------
 net/ipv4/af_inet.c  |  18 +++++---
 net/ipv6/af_inet6.c |  21 ++++++----
 3 files changed, 128 insertions(+), 27 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index bdb9cadd4d27..d31aff93270d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4097,30 +4097,80 @@ static bool lwt_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, prog, info);
 }
 
-static bool sock_filter_is_valid_access(int off, int size,
-					enum bpf_access_type type,
-					const struct bpf_prog *prog,
-					struct bpf_insn_access_aux *info)
+
+/* Attach type specific accesses */
+static bool __sock_filter_check_attach_type(int off,
+					    enum bpf_access_type access_type,
+					    enum bpf_attach_type attach_type)
 {
-	if (type == BPF_WRITE) {
-		switch (off) {
-		case offsetof(struct bpf_sock, bound_dev_if):
-		case offsetof(struct bpf_sock, mark):
-		case offsetof(struct bpf_sock, priority):
-			break;
+	switch (off) {
+	case offsetof(struct bpf_sock, bound_dev_if):
+	case offsetof(struct bpf_sock, mark):
+	case offsetof(struct bpf_sock, priority):
+		switch (attach_type) {
+		case BPF_CGROUP_INET_SOCK_CREATE:
+			goto full_access;
+		default:
+			return false;
+		}
+	case bpf_ctx_range(struct bpf_sock, src_ip4):
+		switch (attach_type) {
+		case BPF_CGROUP_INET4_POST_BIND:
+			goto read_only;
+		default:
+			return false;
+		}
+	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+		switch (attach_type) {
+		case BPF_CGROUP_INET6_POST_BIND:
+			goto read_only;
+		default:
+			return false;
+		}
+	case bpf_ctx_range(struct bpf_sock, src_port):
+		switch (attach_type) {
+		case BPF_CGROUP_INET4_POST_BIND:
+		case BPF_CGROUP_INET6_POST_BIND:
+			goto read_only;
 		default:
 			return false;
 		}
 	}
+read_only:
+	return access_type == BPF_READ;
+full_access:
+	return true;
+}
+
+static bool __sock_filter_check_size(int off, int size,
+				     struct bpf_insn_access_aux *info)
+{
+	const int size_default = sizeof(__u32);
 
-	if (off < 0 || off + size > sizeof(struct bpf_sock))
+	switch (off) {
+	case bpf_ctx_range(struct bpf_sock, src_ip4):
+	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+		bpf_ctx_record_field_size(info, size_default);
+		return bpf_ctx_narrow_access_ok(off, size, size_default);
+	}
+
+	return size == size_default;
+}
+
+static bool sock_filter_is_valid_access(int off, int size,
+					enum bpf_access_type type,
+					const struct bpf_prog *prog,
+					struct bpf_insn_access_aux *info)
+{
+	if (off < 0 || off >= sizeof(struct bpf_sock))
 		return false;
-	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-	if (size != sizeof(__u32))
+	if (!__sock_filter_check_attach_type(off, type,
+					     prog->expected_attach_type))
+		return false;
+	if (!__sock_filter_check_size(off, size, info))
 		return false;
-
 	return true;
 }
 
@@ -4728,6 +4778,7 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
 					  struct bpf_prog *prog, u32 *target_size)
 {
 	struct bpf_insn *insn = insn_buf;
+	int off;
 
 	switch (si->off) {
 	case offsetof(struct bpf_sock, bound_dev_if):
@@ -4783,6 +4834,43 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
 		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
 		break;
+
+	case offsetof(struct bpf_sock, src_ip4):
+		*insn++ = BPF_LDX_MEM(
+			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+			bpf_target_off(struct sock_common, skc_rcv_saddr,
+				       FIELD_SIZEOF(struct sock_common,
+						    skc_rcv_saddr),
+				       target_size));
+		break;
+
+	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+		off = si->off;
+		off -= offsetof(struct bpf_sock, src_ip6[0]);
+		*insn++ = BPF_LDX_MEM(
+			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+			bpf_target_off(
+				struct sock_common,
+				skc_v6_rcv_saddr.s6_addr32[0],
+				FIELD_SIZEOF(struct sock_common,
+					     skc_v6_rcv_saddr.s6_addr32[0]),
+				target_size) + off);
+#else
+		(void)off;
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+		break;
+
+	case offsetof(struct bpf_sock, src_port):
+		*insn++ = BPF_LDX_MEM(
+			BPF_FIELD_SIZEOF(struct sock_common, skc_num),
+			si->dst_reg, si->src_reg,
+			bpf_target_off(struct sock_common, skc_num,
+				       FIELD_SIZEOF(struct sock_common,
+						    skc_num),
+				       target_size));
+		break;
 	}
 
 	return insn - insn_buf;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 488fe26ac8e5..142d4c35b493 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -519,12 +519,18 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
 		inet->inet_saddr = 0;  /* Use device */
 
 	/* Make sure we are allowed to bind here. */
-	if ((snum || !(inet->bind_address_no_port ||
-		       force_bind_address_no_port)) &&
-	    sk->sk_prot->get_port(sk, snum)) {
-		inet->inet_saddr = inet->inet_rcv_saddr = 0;
-		err = -EADDRINUSE;
-		goto out_release_sock;
+	if (snum || !(inet->bind_address_no_port ||
+		      force_bind_address_no_port)) {
+		if (sk->sk_prot->get_port(sk, snum)) {
+			inet->inet_saddr = inet->inet_rcv_saddr = 0;
+			err = -EADDRINUSE;
+			goto out_release_sock;
+		}
+		err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
+		if (err) {
+			inet->inet_saddr = inet->inet_rcv_saddr = 0;
+			goto out_release_sock;
+		}
 	}
 
 	if (inet->inet_rcv_saddr)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 566cec0e0a44..41f50472679d 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -412,13 +412,20 @@ int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
 		sk->sk_ipv6only = 1;
 
 	/* Make sure we are allowed to bind here. */
-	if ((snum || !(inet->bind_address_no_port ||
-		       force_bind_address_no_port)) &&
-	    sk->sk_prot->get_port(sk, snum)) {
-		sk->sk_ipv6only = saved_ipv6only;
-		inet_reset_saddr(sk);
-		err = -EADDRINUSE;
-		goto out;
+	if (snum || !(inet->bind_address_no_port ||
+		      force_bind_address_no_port)) {
+		if (sk->sk_prot->get_port(sk, snum)) {
+			sk->sk_ipv6only = saved_ipv6only;
+			inet_reset_saddr(sk);
+			err = -EADDRINUSE;
+			goto out;
+		}
+		err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk);
+		if (err) {
+			sk->sk_ipv6only = saved_ipv6only;
+			inet_reset_saddr(sk);
+			goto out;
+		}
 	}
 
 	if (addr_type != IPV6_ADDR_ANY)
-- 
cgit v1.2.3


From f40aa23339e2d06b1a8daaece5a511bb1c4f704c Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 30 Mar 2018 13:46:18 +0300
Subject: net: bridge: set min MTU on port events and allow user to set max

Recently the bridge was changed to automatically set maximum MTU on port
events (add/del/changemtu) when vlan filtering is enabled, but that
actually changes behaviour in a way which breaks some setups and can lead
to packet drops. In order to still allow that maximum to be set while being
compatible, we add the ability for the user to tune the bridge MTU up to
the maximum when vlan filtering is enabled, but that has to be done
explicitly and all port events (add/del/changemtu) lead to resetting that
MTU to the minimum as before.

Suggested-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br.c         |  2 +-
 net/bridge/br_device.c  |  3 ++-
 net/bridge/br_if.c      | 43 ++++++++++++++-----------------------------
 net/bridge/br_private.h |  2 +-
 4 files changed, 18 insertions(+), 32 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br.c b/net/bridge/br.c
index 26e1616b2c90..565ff055813b 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -52,7 +52,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 
 	switch (event) {
 	case NETDEV_CHANGEMTU:
-		dev_set_mtu(br->dev, br_mtu(br));
+		dev_set_mtu(br->dev, br_mtu(br, false));
 		break;
 
 	case NETDEV_CHANGEADDR:
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 278fc999d355..edb9967eb165 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -224,7 +224,8 @@ static void br_get_stats64(struct net_device *dev,
 static int br_change_mtu(struct net_device *dev, int new_mtu)
 {
 	struct net_bridge *br = netdev_priv(dev);
-	if (new_mtu > br_mtu(br))
+
+	if (new_mtu > br_mtu(br, br_vlan_enabled(dev)))
 		return -EINVAL;
 
 	dev->mtu = new_mtu;
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 87b2afd455c7..7d5dc5a91084 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -424,41 +424,26 @@ int br_del_bridge(struct net *net, const char *name)
 	return ret;
 }
 
-static bool min_mtu(int a, int b)
-{
-	return a < b ? 1 : 0;
-}
-
-static bool max_mtu(int a, int b)
-{
-	return a > b ? 1 : 0;
-}
-
-/* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */
-static int __br_mtu(const struct net_bridge *br, bool (compare_fn)(int, int))
+/* MTU of the bridge pseudo-device: ETH_DATA_LEN if there are no ports, the
+ * minimum of the ports if @max is false or the maximum if it's true
+ */
+int br_mtu(const struct net_bridge *br, bool max)
 {
 	const struct net_bridge_port *p;
-	int mtu = 0;
+	int ret_mtu = 0;
 
 	ASSERT_RTNL();
 
-	if (list_empty(&br->port_list))
-		mtu = ETH_DATA_LEN;
-	else {
-		list_for_each_entry(p, &br->port_list, list) {
-			if (!mtu || compare_fn(p->dev->mtu, mtu))
-				mtu = p->dev->mtu;
+	list_for_each_entry(p, &br->port_list, list) {
+		if (!max) {
+			if (!ret_mtu || ret_mtu > p->dev->mtu)
+				ret_mtu = p->dev->mtu;
+		} else if (p->dev->mtu > ret_mtu) {
+			ret_mtu = p->dev->mtu;
 		}
 	}
-	return mtu;
-}
 
-int br_mtu(const struct net_bridge *br)
-{
-	if (br_vlan_enabled(br->dev))
-		return __br_mtu(br, max_mtu);
-	else
-		return __br_mtu(br, min_mtu);
+	return ret_mtu ? ret_mtu : ETH_DATA_LEN;
 }
 
 static void br_set_gso_limits(struct net_bridge *br)
@@ -612,7 +597,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
 	if (changed_addr)
 		call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
 
-	dev_set_mtu(br->dev, br_mtu(br));
+	dev_set_mtu(br->dev, br_mtu(br, false));
 	br_set_gso_limits(br);
 
 	kobject_uevent(&p->kobj, KOBJ_ADD);
@@ -659,7 +644,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
 	 */
 	del_nbp(p);
 
-	dev_set_mtu(br->dev, br_mtu(br));
+	dev_set_mtu(br->dev, br_mtu(br, false));
 	br_set_gso_limits(br);
 
 	spin_lock_bh(&br->lock);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 048d5b51813b..586f84b9670d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -578,7 +578,7 @@ int br_del_bridge(struct net *net, const char *name);
 int br_add_if(struct net_bridge *br, struct net_device *dev,
 	      struct netlink_ext_ack *extack);
 int br_del_if(struct net_bridge *br, struct net_device *dev);
-int br_mtu(const struct net_bridge *br);
+int br_mtu(const struct net_bridge *br, bool max);
 netdev_features_t br_features_recompute(struct net_bridge *br,
 					netdev_features_t features);
 void br_port_flags_change(struct net_bridge_port *port, unsigned long mask);
-- 
cgit v1.2.3


From 804b854d374e39f5f8bff9638fd274b9a9ca7d33 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 30 Mar 2018 13:46:19 +0300
Subject: net: bridge: disable bridge MTU auto tuning if it was set manually

As Roopa noted today the biggest source of problems when configuring
bridge and ports is that the bridge MTU keeps changing automatically on
port events (add/del/changemtu). That leads to inconsistent behaviour
and network config software needs to chase the MTU and fix it on each
such event. Let's improve on that situation and allow for the user to
set any MTU within ETH_MIN/MAX limits, but once manually configured it
is the user's responsibility to keep it correct afterwards.

In case the MTU isn't manually set - the behaviour reverts to the
previous and the bridge follows the minimum MTU.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br.c         |  2 +-
 net/bridge/br_device.c  |  5 ++---
 net/bridge/br_if.c      | 36 +++++++++++++++++++++---------------
 net/bridge/br_private.h |  3 ++-
 4 files changed, 26 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br.c b/net/bridge/br.c
index 565ff055813b..671d13c10f6f 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -52,7 +52,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
 
 	switch (event) {
 	case NETDEV_CHANGEMTU:
-		dev_set_mtu(br->dev, br_mtu(br, false));
+		br_mtu_auto_adjust(br);
 		break;
 
 	case NETDEV_CHANGEADDR:
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index edb9967eb165..e682a668ce57 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -225,11 +225,10 @@ static int br_change_mtu(struct net_device *dev, int new_mtu)
 {
 	struct net_bridge *br = netdev_priv(dev);
 
-	if (new_mtu > br_mtu(br, br_vlan_enabled(dev)))
-		return -EINVAL;
-
 	dev->mtu = new_mtu;
 
+	/* this flag will be cleared if the MTU was automatically adjusted */
+	br->mtu_set_by_user = true;
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	/* remember the MTU in the rtable for PMTU */
 	dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu);
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 7d5dc5a91084..82c1a6f430b3 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -424,28 +424,34 @@ int br_del_bridge(struct net *net, const char *name)
 	return ret;
 }
 
-/* MTU of the bridge pseudo-device: ETH_DATA_LEN if there are no ports, the
- * minimum of the ports if @max is false or the maximum if it's true
- */
-int br_mtu(const struct net_bridge *br, bool max)
+/* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */
+static int br_mtu_min(const struct net_bridge *br)
 {
 	const struct net_bridge_port *p;
 	int ret_mtu = 0;
 
-	ASSERT_RTNL();
-
-	list_for_each_entry(p, &br->port_list, list) {
-		if (!max) {
-			if (!ret_mtu || ret_mtu > p->dev->mtu)
-				ret_mtu = p->dev->mtu;
-		} else if (p->dev->mtu > ret_mtu) {
+	list_for_each_entry(p, &br->port_list, list)
+		if (!ret_mtu || ret_mtu > p->dev->mtu)
 			ret_mtu = p->dev->mtu;
-		}
-	}
 
 	return ret_mtu ? ret_mtu : ETH_DATA_LEN;
 }
 
+void br_mtu_auto_adjust(struct net_bridge *br)
+{
+	ASSERT_RTNL();
+
+	/* if the bridge MTU was manually configured don't mess with it */
+	if (br->mtu_set_by_user)
+		return;
+
+	/* change to the minimum MTU and clear the flag which was set by
+	 * the bridge ndo_change_mtu callback
+	 */
+	dev_set_mtu(br->dev, br_mtu_min(br));
+	br->mtu_set_by_user = false;
+}
+
 static void br_set_gso_limits(struct net_bridge *br)
 {
 	unsigned int gso_max_size = GSO_MAX_SIZE;
@@ -597,7 +603,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
 	if (changed_addr)
 		call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
 
-	dev_set_mtu(br->dev, br_mtu(br, false));
+	br_mtu_auto_adjust(br);
 	br_set_gso_limits(br);
 
 	kobject_uevent(&p->kobj, KOBJ_ADD);
@@ -644,7 +650,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
 	 */
 	del_nbp(p);
 
-	dev_set_mtu(br->dev, br_mtu(br, false));
+	br_mtu_auto_adjust(br);
 	br_set_gso_limits(br);
 
 	spin_lock_bh(&br->lock);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 586f84b9670d..a7cb3ece5031 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -410,6 +410,7 @@ struct net_bridge {
 	int offload_fwd_mark;
 #endif
 	bool				neigh_suppress_enabled;
+	bool				mtu_set_by_user;
 	struct hlist_head		fdb_list;
 };
 
@@ -578,7 +579,7 @@ int br_del_bridge(struct net *net, const char *name);
 int br_add_if(struct net_bridge *br, struct net_device *dev,
 	      struct netlink_ext_ack *extack);
 int br_del_if(struct net_bridge *br, struct net_device *dev);
-int br_mtu(const struct net_bridge *br, bool max);
+void br_mtu_auto_adjust(struct net_bridge *br);
 netdev_features_t br_features_recompute(struct net_bridge *br,
 					netdev_features_t features);
 void br_port_flags_change(struct net_bridge_port *port, unsigned long mask);
-- 
cgit v1.2.3


From 218527fe27adaebeb81eb770459eb335517e90ee Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 29 Mar 2018 23:20:41 +0200
Subject: tipc: replace name table service range array with rb tree

The current design of the binding table has an unnecessary memory
consuming and complex data structure. It aggregates the service range
items into an array, which is expanded by a factor two every time it
becomes too small to hold a new item. Furthermore, the arrays never
shrink when the number of ranges diminishes.

We now replace this array with an RB tree that is holding the range
items as tree nodes, each range directly holding a list of bindings.

This, along with a few name changes, improves both readability and
volume of the code, as well as reducing memory consumption and hopefully
improving cache hit rate.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/core.h       |    1 +
 net/tipc/link.c       |    2 +-
 net/tipc/name_table.c | 1032 ++++++++++++++++++++++---------------------------
 net/tipc/name_table.h |    2 +-
 net/tipc/node.c       |    4 +-
 net/tipc/subscr.h     |    4 +-
 6 files changed, 477 insertions(+), 568 deletions(-)

(limited to 'net')

diff --git a/net/tipc/core.h b/net/tipc/core.h
index d0f64ca62d02..8020a6c360ff 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -58,6 +58,7 @@
 #include <linux/etherdevice.h>
 #include <net/netns/generic.h>
 #include <linux/rhashtable.h>
+#include <net/genetlink.h>
 
 struct tipc_node;
 struct tipc_bearer;
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 1289b4ba404f..8f2a9496439b 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1810,7 +1810,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
 
 void tipc_link_set_queue_limits(struct tipc_link *l, u32 win)
 {
-	int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE);
+	int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE);
 
 	l->window = win;
 	l->backlog[TIPC_LOW_IMPORTANCE].limit      = max_t(u16, 50, win);
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 4359605b1bec..e06c7a8907aa 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -44,52 +44,40 @@
 #include "addr.h"
 #include "node.h"
 #include "group.h"
-#include <net/genetlink.h>
-
-#define TIPC_NAMETBL_SIZE 1024		/* must be a power of 2 */
 
 /**
- * struct name_info - name sequence publication info
- * @node_list: list of publications on own node of this <type,lower,upper>
- * @all_publ: list of all publications of this <type,lower,upper>
+ * struct service_range - container for all bindings of a service range
+ * @lower: service range lower bound
+ * @upper: service range upper bound
+ * @tree_node: member of service range RB tree
+ * @local_publ: list of identical publications made from this node
+ *   Used by closest_first lookup and multicast lookup algorithm
+ * @all_publ: all publications identical to this one, whatever node and scope
+ *   Used by round-robin lookup algorithm
  */
-struct name_info {
-	struct list_head local_publ;
-	struct list_head all_publ;
-};
-
-/**
- * struct sub_seq - container for all published instances of a name sequence
- * @lower: name sequence lower bound
- * @upper: name sequence upper bound
- * @info: pointer to name sequence publication info
- */
-struct sub_seq {
+struct service_range {
 	u32 lower;
 	u32 upper;
-	struct name_info *info;
+	struct rb_node tree_node;
+	struct list_head local_publ;
+	struct list_head all_publ;
 };
 
 /**
- * struct name_seq - container for all published instances of a name type
- * @type: 32 bit 'type' value for name sequence
- * @sseq: pointer to dynamically-sized array of sub-sequences of this 'type';
- *        sub-sequences are sorted in ascending order
- * @alloc: number of sub-sequences currently in array
- * @first_free: array index of first unused sub-sequence entry
- * @ns_list: links to adjacent name sequences in hash chain
- * @subscriptions: list of subscriptions for this 'type'
- * @lock: spinlock controlling access to publication lists of all sub-sequences
+ * struct tipc_service - container for all published instances of a service type
+ * @type: 32 bit 'type' value for service
+ * @ranges: rb tree containing all service ranges for this service
+ * @service_list: links to adjacent name ranges in hash chain
+ * @subscriptions: list of subscriptions for this service type
+ * @lock: spinlock controlling access to pertaining service ranges/publications
  * @rcu: RCU callback head used for deferred freeing
  */
-struct name_seq {
+struct tipc_service {
 	u32 type;
-	struct sub_seq *sseqs;
-	u32 alloc;
-	u32 first_free;
-	struct hlist_node ns_list;
+	struct rb_root ranges;
+	struct hlist_node service_list;
 	struct list_head subscriptions;
-	spinlock_t lock;
+	spinlock_t lock; /* Covers service range list */
 	struct rcu_head rcu;
 };
 
@@ -99,17 +87,16 @@ static int hash(int x)
 }
 
 /**
- * publ_create - create a publication structure
+ * tipc_publ_create - create a publication structure
  */
-static struct publication *publ_create(u32 type, u32 lower, u32 upper,
-				       u32 scope, u32 node, u32 port,
-				       u32 key)
+static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper,
+					    u32 scope, u32 node, u32 port,
+					    u32 key)
 {
 	struct publication *publ = kzalloc(sizeof(*publ), GFP_ATOMIC);
-	if (publ == NULL) {
-		pr_warn("Publication creation failure, no memory\n");
+
+	if (!publ)
 		return NULL;
-	}
 
 	publ->type = type;
 	publ->lower = lower;
@@ -119,372 +106,298 @@ static struct publication *publ_create(u32 type, u32 lower, u32 upper,
 	publ->port = port;
 	publ->key = key;
 	INIT_LIST_HEAD(&publ->binding_sock);
+	INIT_LIST_HEAD(&publ->binding_node);
+	INIT_LIST_HEAD(&publ->local_publ);
+	INIT_LIST_HEAD(&publ->all_publ);
 	return publ;
 }
 
 /**
- * tipc_subseq_alloc - allocate a specified number of sub-sequence structures
- */
-static struct sub_seq *tipc_subseq_alloc(u32 cnt)
-{
-	return kcalloc(cnt, sizeof(struct sub_seq), GFP_ATOMIC);
-}
-
-/**
- * tipc_nameseq_create - create a name sequence structure for the specified 'type'
+ * tipc_service_create - create a service structure for the specified 'type'
  *
- * Allocates a single sub-sequence structure and sets it to all 0's.
+ * Allocates a single range structure and sets it to all 0's.
  */
-static struct name_seq *tipc_nameseq_create(u32 type, struct hlist_head *seq_head)
+static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd)
 {
-	struct name_seq *nseq = kzalloc(sizeof(*nseq), GFP_ATOMIC);
-	struct sub_seq *sseq = tipc_subseq_alloc(1);
+	struct tipc_service *service = kzalloc(sizeof(*service), GFP_ATOMIC);
 
-	if (!nseq || !sseq) {
-		pr_warn("Name sequence creation failed, no memory\n");
-		kfree(nseq);
-		kfree(sseq);
+	if (!service) {
+		pr_warn("Service creation failed, no memory\n");
 		return NULL;
 	}
 
-	spin_lock_init(&nseq->lock);
-	nseq->type = type;
-	nseq->sseqs = sseq;
-	nseq->alloc = 1;
-	INIT_HLIST_NODE(&nseq->ns_list);
-	INIT_LIST_HEAD(&nseq->subscriptions);
-	hlist_add_head_rcu(&nseq->ns_list, seq_head);
-	return nseq;
+	spin_lock_init(&service->lock);
+	service->type = type;
+	service->ranges = RB_ROOT;
+	INIT_HLIST_NODE(&service->service_list);
+	INIT_LIST_HEAD(&service->subscriptions);
+	hlist_add_head_rcu(&service->service_list, hd);
+	return service;
 }
 
 /**
- * nameseq_find_subseq - find sub-sequence (if any) matching a name instance
+ * tipc_service_find_range - find service range matching a service instance
  *
- * Very time-critical, so binary searches through sub-sequence array.
+ * Very time-critical, so binary search through range rb tree
  */
-static struct sub_seq *nameseq_find_subseq(struct name_seq *nseq,
-					   u32 instance)
+static struct service_range *tipc_service_find_range(struct tipc_service *sc,
+						     u32 instance)
 {
-	struct sub_seq *sseqs = nseq->sseqs;
-	int low = 0;
-	int high = nseq->first_free - 1;
-	int mid;
-
-	while (low <= high) {
-		mid = (low + high) / 2;
-		if (instance < sseqs[mid].lower)
-			high = mid - 1;
-		else if (instance > sseqs[mid].upper)
-			low = mid + 1;
+	struct rb_node *n = sc->ranges.rb_node;
+	struct service_range *sr;
+
+	while (n) {
+		sr = container_of(n, struct service_range, tree_node);
+		if (sr->lower > instance)
+			n = n->rb_left;
+		else if (sr->upper < instance)
+			n = n->rb_right;
 		else
-			return &sseqs[mid];
+			return sr;
 	}
 	return NULL;
 }
 
-/**
- * nameseq_locate_subseq - determine position of name instance in sub-sequence
- *
- * Returns index in sub-sequence array of the entry that contains the specified
- * instance value; if no entry contains that value, returns the position
- * where a new entry for it would be inserted in the array.
- *
- * Note: Similar to binary search code for locating a sub-sequence.
- */
-static u32 nameseq_locate_subseq(struct name_seq *nseq, u32 instance)
+static struct service_range *tipc_service_create_range(struct tipc_service *sc,
+						       u32 lower, u32 upper)
 {
-	struct sub_seq *sseqs = nseq->sseqs;
-	int low = 0;
-	int high = nseq->first_free - 1;
-	int mid;
-
-	while (low <= high) {
-		mid = (low + high) / 2;
-		if (instance < sseqs[mid].lower)
-			high = mid - 1;
-		else if (instance > sseqs[mid].upper)
-			low = mid + 1;
+	struct rb_node **n, *parent = NULL;
+	struct service_range *sr, *tmp;
+
+	n = &sc->ranges.rb_node;
+	while (*n) {
+		tmp = container_of(*n, struct service_range, tree_node);
+		parent = *n;
+		tmp = container_of(parent, struct service_range, tree_node);
+		if (lower < tmp->lower)
+			n = &(*n)->rb_left;
+		else if (upper > tmp->upper)
+			n = &(*n)->rb_right;
 		else
-			return mid;
+			return NULL;
 	}
-	return low;
+	sr = kzalloc(sizeof(*sr), GFP_ATOMIC);
+	if (!sr)
+		return NULL;
+	sr->lower = lower;
+	sr->upper = upper;
+	INIT_LIST_HEAD(&sr->local_publ);
+	INIT_LIST_HEAD(&sr->all_publ);
+	rb_link_node(&sr->tree_node, parent, n);
+	rb_insert_color(&sr->tree_node, &sc->ranges);
+	return sr;
 }
 
-/**
- * tipc_nameseq_insert_publ
- */
-static struct publication *tipc_nameseq_insert_publ(struct net *net,
-						    struct name_seq *nseq,
+static struct publication *tipc_service_insert_publ(struct net *net,
+						    struct tipc_service *sc,
 						    u32 type, u32 lower,
 						    u32 upper, u32 scope,
-						    u32 node, u32 port, u32 key)
+						    u32 node, u32 port,
+						    u32 key)
 {
-	struct tipc_subscription *s;
-	struct tipc_subscription *st;
-	struct publication *publ;
-	struct sub_seq *sseq;
-	struct name_info *info;
-	int created_subseq = 0;
-
-	sseq = nameseq_find_subseq(nseq, lower);
-	if (sseq) {
-
-		/* Lower end overlaps existing entry => need an exact match */
-		if ((sseq->lower != lower) || (sseq->upper != upper)) {
-			return NULL;
-		}
-
-		info = sseq->info;
-
-		/* Check if an identical publication already exists */
-		list_for_each_entry(publ, &info->all_publ, all_publ) {
-			if (publ->port == port && publ->key == key &&
-			    (!publ->node || publ->node == node))
-				return NULL;
-		}
-	} else {
-		u32 inspos;
-		struct sub_seq *freesseq;
-
-		/* Find where lower end should be inserted */
-		inspos = nameseq_locate_subseq(nseq, lower);
-
-		/* Fail if upper end overlaps into an existing entry */
-		if ((inspos < nseq->first_free) &&
-		    (upper >= nseq->sseqs[inspos].lower)) {
-			return NULL;
-		}
-
-		/* Ensure there is space for new sub-sequence */
-		if (nseq->first_free == nseq->alloc) {
-			struct sub_seq *sseqs = tipc_subseq_alloc(nseq->alloc * 2);
-
-			if (!sseqs) {
-				pr_warn("Cannot publish {%u,%u,%u}, no memory\n",
-					type, lower, upper);
-				return NULL;
-			}
-			memcpy(sseqs, nseq->sseqs,
-			       nseq->alloc * sizeof(struct sub_seq));
-			kfree(nseq->sseqs);
-			nseq->sseqs = sseqs;
-			nseq->alloc *= 2;
-		}
-
-		info = kzalloc(sizeof(*info), GFP_ATOMIC);
-		if (!info) {
-			pr_warn("Cannot publish {%u,%u,%u}, no memory\n",
-				type, lower, upper);
-			return NULL;
-		}
-
-		INIT_LIST_HEAD(&info->local_publ);
-		INIT_LIST_HEAD(&info->all_publ);
-
-		/* Insert new sub-sequence */
-		sseq = &nseq->sseqs[inspos];
-		freesseq = &nseq->sseqs[nseq->first_free];
-		memmove(sseq + 1, sseq, (freesseq - sseq) * sizeof(*sseq));
-		memset(sseq, 0, sizeof(*sseq));
-		nseq->first_free++;
-		sseq->lower = lower;
-		sseq->upper = upper;
-		sseq->info = info;
-		created_subseq = 1;
+	struct tipc_subscription *sub, *tmp;
+	struct service_range *sr;
+	struct publication *p;
+	bool first = false;
+
+	sr = tipc_service_find_range(sc, lower);
+	if (!sr) {
+		sr = tipc_service_create_range(sc, lower, upper);
+		if (!sr)
+			goto  err;
+		first = true;
 	}
 
-	/* Insert a publication */
-	publ = publ_create(type, lower, upper, scope, node, port, key);
-	if (!publ)
+	/* Lower end overlaps existing entry, but we need an exact match */
+	if (sr->lower != lower || sr->upper != upper)
 		return NULL;
 
-	list_add(&publ->all_publ, &info->all_publ);
+	/* Return if the publication already exists */
+	list_for_each_entry(p, &sr->all_publ, all_publ) {
+		if (p->key == key && (!p->node || p->node == node))
+			return NULL;
+	}
 
+	/* Create and insert publication */
+	p = tipc_publ_create(type, lower, upper, scope, node, port, key);
+	if (!p)
+		goto err;
 	if (in_own_node(net, node))
-		list_add(&publ->local_publ, &info->local_publ);
+		list_add(&p->local_publ, &sr->local_publ);
+	list_add(&p->all_publ, &sr->all_publ);
 
 	/* Any subscriptions waiting for notification?  */
-	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
-		tipc_sub_report_overlap(s, publ->lower, publ->upper,
-					TIPC_PUBLISHED, publ->port,
-					publ->node, publ->scope,
-					created_subseq);
+	list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
+		tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_PUBLISHED,
+					p->port, p->node, p->scope, first);
 	}
-	return publ;
+	return p;
+err:
+	pr_warn("Failed to bind to %u,%u,%u, no memory\n", type, lower, upper);
+	return NULL;
 }
 
 /**
- * tipc_nameseq_remove_publ
+ * tipc_service_remove_publ - remove a publication from a service
  *
  * NOTE: There may be cases where TIPC is asked to remove a publication
  * that is not in the name table.  For example, if another node issues a
- * publication for a name sequence that overlaps an existing name sequence
+ * publication for a name range that overlaps an existing name range
  * the publication will not be recorded, which means the publication won't
- * be found when the name sequence is later withdrawn by that node.
+ * be found when the name range is later withdrawn by that node.
  * A failed withdraw request simply returns a failure indication and lets the
  * caller issue any error or warning messages associated with such a problem.
  */
-static struct publication *tipc_nameseq_remove_publ(struct net *net,
-						    struct name_seq *nseq,
+static struct publication *tipc_service_remove_publ(struct net *net,
+						    struct tipc_service *sc,
 						    u32 inst, u32 node,
 						    u32 port, u32 key)
 {
-	struct publication *publ;
-	struct sub_seq *sseq = nameseq_find_subseq(nseq, inst);
-	struct name_info *info;
-	struct sub_seq *free;
-	struct tipc_subscription *s, *st;
-	int removed_subseq = 0;
-
-	if (!sseq)
-		return NULL;
+	struct tipc_subscription *sub, *tmp;
+	struct service_range *sr;
+	struct publication *p;
+	bool found = false;
+	bool last = false;
 
-	info = sseq->info;
+	sr = tipc_service_find_range(sc, inst);
+	if (!sr)
+		return NULL;
 
-	/* Locate publication, if it exists */
-	list_for_each_entry(publ, &info->all_publ, all_publ) {
-		if (publ->key == key && publ->port == port &&
-		    (!publ->node || publ->node == node))
-			goto found;
+	/* Find publication, if it exists */
+	list_for_each_entry(p, &sr->all_publ, all_publ) {
+		if (p->key != key || (node && node != p->node))
+			continue;
+		found = true;
+		break;
 	}
-	return NULL;
+	if (!found)
+		return NULL;
 
-found:
-	list_del(&publ->all_publ);
-	if (in_own_node(net, node))
-		list_del(&publ->local_publ);
-
-	/* Contract subseq list if no more publications for that subseq */
-	if (list_empty(&info->all_publ)) {
-		kfree(info);
-		free = &nseq->sseqs[nseq->first_free--];
-		memmove(sseq, sseq + 1, (free - (sseq + 1)) * sizeof(*sseq));
-		removed_subseq = 1;
+	list_del(&p->all_publ);
+	list_del(&p->local_publ);
+
+	/* Remove service range item if this was its last publication */
+	if (list_empty(&sr->all_publ)) {
+		last = true;
+		rb_erase(&sr->tree_node, &sc->ranges);
+		kfree(sr);
 	}
 
 	/* Notify any waiting subscriptions */
-	list_for_each_entry_safe(s, st, &nseq->subscriptions, nameseq_list) {
-		tipc_sub_report_overlap(s, publ->lower, publ->upper,
-					TIPC_WITHDRAWN, publ->port,
-					publ->node, publ->scope,
-					removed_subseq);
+	list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) {
+		tipc_sub_report_overlap(sub, p->lower, p->upper, TIPC_WITHDRAWN,
+					p->port, p->node, p->scope, last);
 	}
-
-	return publ;
+	return p;
 }
 
 /**
- * tipc_nameseq_subscribe - attach a subscription, and optionally
- * issue the prescribed number of events if there is any sub-
- * sequence overlapping with the requested sequence
+ * tipc_service_subscribe - attach a subscription, and optionally
+ * issue the prescribed number of events if there is any service
+ * range overlapping with the requested range
  */
-static void tipc_nameseq_subscribe(struct name_seq *nseq,
+static void tipc_service_subscribe(struct tipc_service *service,
 				   struct tipc_subscription *sub)
 {
-	struct sub_seq *sseq = nseq->sseqs;
+	struct tipc_subscr *sb = &sub->evt.s;
+	struct service_range *sr;
 	struct tipc_name_seq ns;
-	struct tipc_subscr *s = &sub->evt.s;
-	bool no_status;
+	struct publication *p;
+	struct rb_node *n;
+	bool first;
 
-	ns.type = tipc_sub_read(s, seq.type);
-	ns.lower = tipc_sub_read(s, seq.lower);
-	ns.upper = tipc_sub_read(s, seq.upper);
-	no_status = tipc_sub_read(s, filter) & TIPC_SUB_NO_STATUS;
+	ns.type = tipc_sub_read(sb, seq.type);
+	ns.lower = tipc_sub_read(sb, seq.lower);
+	ns.upper = tipc_sub_read(sb, seq.upper);
 
 	tipc_sub_get(sub);
-	list_add(&sub->nameseq_list, &nseq->subscriptions);
+	list_add(&sub->service_list, &service->subscriptions);
 
-	if (no_status || !sseq)
+	if (tipc_sub_read(sb, filter) & TIPC_SUB_NO_STATUS)
 		return;
 
-	while (sseq != &nseq->sseqs[nseq->first_free]) {
-		if (tipc_sub_check_overlap(&ns, sseq->lower, sseq->upper)) {
-			struct publication *crs;
-			struct name_info *info = sseq->info;
-			int must_report = 1;
-
-			list_for_each_entry(crs, &info->all_publ, all_publ) {
-				tipc_sub_report_overlap(sub, sseq->lower,
-							sseq->upper,
-							TIPC_PUBLISHED,
-							crs->port,
-							crs->node,
-							crs->scope,
-							must_report);
-				must_report = 0;
-			}
+	for (n = rb_first(&service->ranges); n; n = rb_next(n)) {
+		sr = container_of(n, struct service_range, tree_node);
+		if (sr->lower > ns.upper)
+			break;
+		if (!tipc_sub_check_overlap(&ns, sr->lower, sr->upper))
+			continue;
+		first = true;
+
+		list_for_each_entry(p, &sr->all_publ, all_publ) {
+			tipc_sub_report_overlap(sub, sr->lower, sr->upper,
+						TIPC_PUBLISHED,	p->port,
+						p->node, p->scope, first);
+			first = false;
 		}
-		sseq++;
 	}
 }
 
-static struct name_seq *nametbl_find_seq(struct net *net, u32 type)
+static struct tipc_service *tipc_service_find(struct net *net, u32 type)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	struct hlist_head *seq_head;
-	struct name_seq *ns;
-
-	seq_head = &tn->nametbl->seq_hlist[hash(type)];
-	hlist_for_each_entry_rcu(ns, seq_head, ns_list) {
-		if (ns->type == type)
-			return ns;
+	struct name_table *nt = tipc_name_table(net);
+	struct hlist_head *service_head;
+	struct tipc_service *service;
+
+	service_head = &nt->services[hash(type)];
+	hlist_for_each_entry_rcu(service, service_head, service_list) {
+		if (service->type == type)
+			return service;
 	}
-
 	return NULL;
 };
 
 struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
-					     u32 lower, u32 upper, u32 scope,
-					     u32 node, u32 port, u32 key)
+					     u32 lower, u32 upper,
+					     u32 scope, u32 node,
+					     u32 port, u32 key)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	struct publication *publ;
-	struct name_seq *seq = nametbl_find_seq(net, type);
-	int index = hash(type);
+	struct name_table *nt = tipc_name_table(net);
+	struct tipc_service *sc;
+	struct publication *p;
 
 	if (scope > TIPC_NODE_SCOPE || lower > upper) {
-		pr_debug("Failed to publish illegal {%u,%u,%u} with scope %u\n",
+		pr_debug("Failed to bind illegal {%u,%u,%u} with scope %u\n",
 			 type, lower, upper, scope);
 		return NULL;
 	}
-
-	if (!seq)
-		seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]);
-	if (!seq)
+	sc = tipc_service_find(net, type);
+	if (!sc)
+		sc = tipc_service_create(type, &nt->services[hash(type)]);
+	if (!sc)
 		return NULL;
 
-	spin_lock_bh(&seq->lock);
-	publ = tipc_nameseq_insert_publ(net, seq, type, lower, upper,
-					scope, node, port, key);
-	spin_unlock_bh(&seq->lock);
-	return publ;
+	spin_lock_bh(&sc->lock);
+	p = tipc_service_insert_publ(net, sc, type, lower, upper,
+				     scope, node, port, key);
+	spin_unlock_bh(&sc->lock);
+	return p;
 }
 
 struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
-					     u32 lower, u32 node, u32 port,
-					     u32 key)
+						 u32 lower, u32 node, u32 port,
+						 u32 key)
 {
-	struct publication *publ;
-	struct name_seq *seq = nametbl_find_seq(net, type);
+	struct tipc_service *sc = tipc_service_find(net, type);
+	struct publication *p = NULL;
 
-	if (!seq)
+	if (!sc)
 		return NULL;
 
-	spin_lock_bh(&seq->lock);
-	publ = tipc_nameseq_remove_publ(net, seq, lower, node, port, key);
-	if (!seq->first_free && list_empty(&seq->subscriptions)) {
-		hlist_del_init_rcu(&seq->ns_list);
-		kfree(seq->sseqs);
-		spin_unlock_bh(&seq->lock);
-		kfree_rcu(seq, rcu);
-		return publ;
+	spin_lock_bh(&sc->lock);
+	p = tipc_service_remove_publ(net, sc, lower, node, port, key);
+
+	/* Delete service item if this no more publications and subscriptions */
+	if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) {
+		hlist_del_init_rcu(&sc->service_list);
+		kfree_rcu(sc, rcu);
 	}
-	spin_unlock_bh(&seq->lock);
-	return publ;
+	spin_unlock_bh(&sc->lock);
+	return p;
 }
 
 /**
- * tipc_nametbl_translate - perform name translation
+ * tipc_nametbl_translate - perform service instance to socket translation
  *
  * On entry, 'destnode' is the search domain used during translation.
  *
@@ -492,7 +405,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
  * - if name translation is deferred to another node/cluster/zone,
  *   leaves 'destnode' unchanged (will be non-zero) and returns 0
  * - if name translation is attempted and succeeds, sets 'destnode'
- *   to publishing node and returns port reference (will be non-zero)
+ *   to publication node and returns port reference (will be non-zero)
  * - if name translation is attempted and fails, sets 'destnode' to 0
  *   and returns 0
  */
@@ -502,10 +415,9 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
 	struct tipc_net *tn = tipc_net(net);
 	bool legacy = tn->legacy_addr_format;
 	u32 self = tipc_own_addr(net);
-	struct sub_seq *sseq;
-	struct name_info *info;
-	struct publication *publ;
-	struct name_seq *seq;
+	struct service_range *sr;
+	struct tipc_service *sc;
+	struct publication *p;
 	u32 port = 0;
 	u32 node = 0;
 
@@ -513,49 +425,49 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
 		return 0;
 
 	rcu_read_lock();
-	seq = nametbl_find_seq(net, type);
-	if (unlikely(!seq))
+	sc = tipc_service_find(net, type);
+	if (unlikely(!sc))
 		goto not_found;
-	spin_lock_bh(&seq->lock);
-	sseq = nameseq_find_subseq(seq, instance);
-	if (unlikely(!sseq))
+
+	spin_lock_bh(&sc->lock);
+	sr = tipc_service_find_range(sc, instance);
+	if (unlikely(!sr))
 		goto no_match;
-	info = sseq->info;
 
 	/* Closest-First Algorithm */
 	if (legacy && !*destnode) {
-		if (!list_empty(&info->local_publ)) {
-			publ = list_first_entry(&info->local_publ,
-						struct publication,
-						local_publ);
-			list_move_tail(&publ->local_publ,
-				       &info->local_publ);
+		if (!list_empty(&sr->local_publ)) {
+			p = list_first_entry(&sr->local_publ,
+					     struct publication,
+					     local_publ);
+			list_move_tail(&p->local_publ,
+				       &sr->local_publ);
 		} else {
-			publ = list_first_entry(&info->all_publ,
-						struct publication,
-						all_publ);
-			list_move_tail(&publ->all_publ,
-				       &info->all_publ);
+			p = list_first_entry(&sr->all_publ,
+					     struct publication,
+					     all_publ);
+			list_move_tail(&p->all_publ,
+				       &sr->all_publ);
 		}
 	}
 
 	/* Round-Robin Algorithm */
-	else if (*destnode == tipc_own_addr(net)) {
-		if (list_empty(&info->local_publ))
+	else if (*destnode == self) {
+		if (list_empty(&sr->local_publ))
 			goto no_match;
-		publ = list_first_entry(&info->local_publ, struct publication,
-					local_publ);
-		list_move_tail(&publ->local_publ, &info->local_publ);
+		p = list_first_entry(&sr->local_publ, struct publication,
+				     local_publ);
+		list_move_tail(&p->local_publ, &sr->local_publ);
 	} else {
-		publ = list_first_entry(&info->all_publ, struct publication,
-					all_publ);
-		list_move_tail(&publ->all_publ, &info->all_publ);
+		p = list_first_entry(&sr->all_publ, struct publication,
+				     all_publ);
+		list_move_tail(&p->all_publ, &sr->all_publ);
 	}
 
-	port = publ->port;
-	node = publ->node;
+	port = p->port;
+	node = p->node;
 no_match:
-	spin_unlock_bh(&seq->lock);
+	spin_unlock_bh(&sc->lock);
 not_found:
 	rcu_read_unlock();
 	*destnode = node;
@@ -567,34 +479,36 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope,
 			 bool all)
 {
 	u32 self = tipc_own_addr(net);
-	struct publication *publ;
-	struct name_info *info;
-	struct name_seq *seq;
-	struct sub_seq *sseq;
+	struct service_range *sr;
+	struct tipc_service *sc;
+	struct publication *p;
 
 	*dstcnt = 0;
 	rcu_read_lock();
-	seq = nametbl_find_seq(net, type);
-	if (unlikely(!seq))
+	sc = tipc_service_find(net, type);
+	if (unlikely(!sc))
 		goto exit;
-	spin_lock_bh(&seq->lock);
-	sseq = nameseq_find_subseq(seq, instance);
-	if (likely(sseq)) {
-		info = sseq->info;
-		list_for_each_entry(publ, &info->all_publ, all_publ) {
-			if (publ->scope != scope)
-				continue;
-			if (publ->port == exclude && publ->node == self)
-				continue;
-			tipc_dest_push(dsts, publ->node, publ->port);
-			(*dstcnt)++;
-			if (all)
-				continue;
-			list_move_tail(&publ->all_publ, &info->all_publ);
-			break;
-		}
+
+	spin_lock_bh(&sc->lock);
+
+	sr = tipc_service_find_range(sc, instance);
+	if (!sr)
+		goto no_match;
+
+	list_for_each_entry(p, &sr->all_publ, all_publ) {
+		if (p->scope != scope)
+			continue;
+		if (p->port == exclude && p->node == self)
+			continue;
+		tipc_dest_push(dsts, p->node, p->port);
+		(*dstcnt)++;
+		if (all)
+			continue;
+		list_move_tail(&p->all_publ, &sr->all_publ);
+		break;
 	}
-	spin_unlock_bh(&seq->lock);
+no_match:
+	spin_unlock_bh(&sc->lock);
 exit:
 	rcu_read_unlock();
 	return !list_empty(dsts);
@@ -603,61 +517,64 @@ exit:
 void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper,
 			    u32 scope, bool exact, struct list_head *dports)
 {
-	struct sub_seq *sseq_stop;
-	struct name_info *info;
+	struct service_range *sr;
+	struct tipc_service *sc;
 	struct publication *p;
-	struct name_seq *seq;
-	struct sub_seq *sseq;
+	struct rb_node *n;
 
 	rcu_read_lock();
-	seq = nametbl_find_seq(net, type);
-	if (!seq)
+	sc = tipc_service_find(net, type);
+	if (!sc)
 		goto exit;
 
-	spin_lock_bh(&seq->lock);
-	sseq = seq->sseqs + nameseq_locate_subseq(seq, lower);
-	sseq_stop = seq->sseqs + seq->first_free;
-	for (; sseq != sseq_stop; sseq++) {
-		if (sseq->lower > upper)
+	spin_lock_bh(&sc->lock);
+
+	for (n = rb_first(&sc->ranges); n; n = rb_next(n)) {
+		sr = container_of(n, struct service_range, tree_node);
+		if (sr->upper < lower)
+			continue;
+		if (sr->lower > upper)
 			break;
-		info = sseq->info;
-		list_for_each_entry(p, &info->local_publ, local_publ) {
+		list_for_each_entry(p, &sr->local_publ, local_publ) {
 			if (p->scope == scope || (!exact && p->scope < scope))
 				tipc_dest_push(dports, 0, p->port);
 		}
 	}
-	spin_unlock_bh(&seq->lock);
+	spin_unlock_bh(&sc->lock);
 exit:
 	rcu_read_unlock();
 }
 
 /* tipc_nametbl_lookup_dst_nodes - find broadcast destination nodes
  * - Creates list of nodes that overlap the given multicast address
- * - Determines if any node local ports overlap
+ * - Determines if any node local destinations overlap
  */
 void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower,
 				   u32 upper, struct tipc_nlist *nodes)
 {
-	struct sub_seq *sseq, *stop;
-	struct publication *publ;
-	struct name_info *info;
-	struct name_seq *seq;
+	struct service_range *sr;
+	struct tipc_service *sc;
+	struct publication *p;
+	struct rb_node *n;
 
 	rcu_read_lock();
-	seq = nametbl_find_seq(net, type);
-	if (!seq)
+	sc = tipc_service_find(net, type);
+	if (!sc)
 		goto exit;
 
-	spin_lock_bh(&seq->lock);
-	sseq = seq->sseqs + nameseq_locate_subseq(seq, lower);
-	stop = seq->sseqs + seq->first_free;
-	for (; sseq != stop && sseq->lower <= upper; sseq++) {
-		info = sseq->info;
-		list_for_each_entry(publ, &info->all_publ, all_publ) {
-			tipc_nlist_add(nodes, publ->node);
+	spin_lock_bh(&sc->lock);
+
+	for (n = rb_first(&sc->ranges); n; n = rb_next(n)) {
+		sr = container_of(n, struct service_range, tree_node);
+		if (sr->upper < lower)
+			continue;
+		if (sr->lower > upper)
+			break;
+		list_for_each_entry(p, &sr->all_publ, all_publ) {
+			tipc_nlist_add(nodes, p->node);
 		}
 	}
-	spin_unlock_bh(&seq->lock);
+	spin_unlock_bh(&sc->lock);
 exit:
 	rcu_read_unlock();
 }
@@ -667,89 +584,88 @@ exit:
 void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp,
 			      u32 type, u32 scope)
 {
-	struct sub_seq *sseq, *stop;
-	struct name_info *info;
+	struct service_range *sr;
+	struct tipc_service *sc;
 	struct publication *p;
-	struct name_seq *seq;
+	struct rb_node *n;
 
 	rcu_read_lock();
-	seq = nametbl_find_seq(net, type);
-	if (!seq)
+	sc = tipc_service_find(net, type);
+	if (!sc)
 		goto exit;
 
-	spin_lock_bh(&seq->lock);
-	sseq = seq->sseqs;
-	stop = seq->sseqs + seq->first_free;
-	for (; sseq != stop; sseq++) {
-		info = sseq->info;
-		list_for_each_entry(p, &info->all_publ, all_publ) {
+	spin_lock_bh(&sc->lock);
+	for (n = rb_first(&sc->ranges); n; n = rb_next(n)) {
+		sr = container_of(n, struct service_range, tree_node);
+		list_for_each_entry(p, &sr->all_publ, all_publ) {
 			if (p->scope != scope)
 				continue;
 			tipc_group_add_member(grp, p->node, p->port, p->lower);
 		}
 	}
-	spin_unlock_bh(&seq->lock);
+	spin_unlock_bh(&sc->lock);
 exit:
 	rcu_read_unlock();
 }
 
-/*
- * tipc_nametbl_publish - add name publication to network name tables
+/* tipc_nametbl_publish - add service binding to name table
  */
 struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
-					 u32 upper, u32 scope, u32 port_ref,
+					 u32 upper, u32 scope, u32 port,
 					 u32 key)
 {
-	struct publication *publ;
-	struct sk_buff *buf = NULL;
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct name_table *nt = tipc_name_table(net);
+	struct tipc_net *tn = tipc_net(net);
+	struct publication *p = NULL;
+	struct sk_buff *skb = NULL;
 
 	spin_lock_bh(&tn->nametbl_lock);
-	if (tn->nametbl->local_publ_count >= TIPC_MAX_PUBLICATIONS) {
-		pr_warn("Publication failed, local publication limit reached (%u)\n",
-			TIPC_MAX_PUBLICATIONS);
-		spin_unlock_bh(&tn->nametbl_lock);
-		return NULL;
+
+	if (nt->local_publ_count >= TIPC_MAX_PUBL) {
+		pr_warn("Bind failed, max limit %u reached\n", TIPC_MAX_PUBL);
+		goto exit;
 	}
 
-	publ = tipc_nametbl_insert_publ(net, type, lower, upper, scope,
-					tipc_own_addr(net), port_ref, key);
-	if (likely(publ)) {
-		tn->nametbl->local_publ_count++;
-		buf = tipc_named_publish(net, publ);
+	p = tipc_nametbl_insert_publ(net, type, lower, upper, scope,
+				     tipc_own_addr(net), port, key);
+	if (p) {
+		nt->local_publ_count++;
+		skb = tipc_named_publish(net, p);
 		/* Any pending external events? */
 		tipc_named_process_backlog(net);
 	}
+exit:
 	spin_unlock_bh(&tn->nametbl_lock);
 
-	if (buf)
-		tipc_node_broadcast(net, buf);
-	return publ;
+	if (skb)
+		tipc_node_broadcast(net, skb);
+	return p;
 }
 
 /**
- * tipc_nametbl_withdraw - withdraw name publication from network name tables
+ * tipc_nametbl_withdraw - withdraw a service binding
  */
-int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 port,
-			  u32 key)
+int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower,
+			  u32 port, u32 key)
 {
-	struct publication *publ;
+	struct name_table *nt = tipc_name_table(net);
+	struct tipc_net *tn = tipc_net(net);
+	u32 self = tipc_own_addr(net);
 	struct sk_buff *skb = NULL;
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct publication *p;
 
 	spin_lock_bh(&tn->nametbl_lock);
-	publ = tipc_nametbl_remove_publ(net, type, lower, tipc_own_addr(net),
-					port, key);
-	if (likely(publ)) {
-		tn->nametbl->local_publ_count--;
-		skb = tipc_named_withdraw(net, publ);
+
+	p = tipc_nametbl_remove_publ(net, type, lower, self, port, key);
+	if (p) {
+		nt->local_publ_count--;
+		skb = tipc_named_withdraw(net, p);
 		/* Any pending external events? */
 		tipc_named_process_backlog(net);
-		list_del_init(&publ->binding_sock);
-		kfree_rcu(publ, rcu);
+		list_del_init(&p->binding_sock);
+		kfree_rcu(p, rcu);
 	} else {
-		pr_err("Unable to remove local publication\n"
-		       "(type=%u, lower=%u, port=%u, key=%u)\n",
+		pr_err("Failed to remove local publication {%u,%u,%u}/%u\n",
 		       type, lower, port, key);
 	}
 	spin_unlock_bh(&tn->nametbl_lock);
@@ -766,27 +682,24 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 port,
  */
 void tipc_nametbl_subscribe(struct tipc_subscription *sub)
 {
+	struct name_table *nt = tipc_name_table(sub->net);
 	struct tipc_net *tn = tipc_net(sub->net);
 	struct tipc_subscr *s = &sub->evt.s;
 	u32 type = tipc_sub_read(s, seq.type);
-	int index = hash(type);
-	struct name_seq *seq;
-	struct tipc_name_seq ns;
+	struct tipc_service *sc;
 
 	spin_lock_bh(&tn->nametbl_lock);
-	seq = nametbl_find_seq(sub->net, type);
-	if (!seq)
-		seq = tipc_nameseq_create(type, &tn->nametbl->seq_hlist[index]);
-	if (seq) {
-		spin_lock_bh(&seq->lock);
-		tipc_nameseq_subscribe(seq, sub);
-		spin_unlock_bh(&seq->lock);
+	sc = tipc_service_find(sub->net, type);
+	if (!sc)
+		sc = tipc_service_create(type, &nt->services[hash(type)]);
+	if (sc) {
+		spin_lock_bh(&sc->lock);
+		tipc_service_subscribe(sc, sub);
+		spin_unlock_bh(&sc->lock);
 	} else {
-		ns.type = tipc_sub_read(s, seq.type);
-		ns.lower = tipc_sub_read(s, seq.lower);
-		ns.upper = tipc_sub_read(s, seq.upper);
-		pr_warn("Failed to create subscription for {%u,%u,%u}\n",
-			ns.type, ns.lower, ns.upper);
+		pr_warn("Failed to subscribe for {%u,%u,%u}\n", type,
+			tipc_sub_read(s, seq.lower),
+			tipc_sub_read(s, seq.upper));
 	}
 	spin_unlock_bh(&tn->nametbl_lock);
 }
@@ -796,124 +709,122 @@ void tipc_nametbl_subscribe(struct tipc_subscription *sub)
  */
 void tipc_nametbl_unsubscribe(struct tipc_subscription *sub)
 {
-	struct tipc_subscr *s = &sub->evt.s;
 	struct tipc_net *tn = tipc_net(sub->net);
-	struct name_seq *seq;
+	struct tipc_subscr *s = &sub->evt.s;
 	u32 type = tipc_sub_read(s, seq.type);
+	struct tipc_service *sc;
 
 	spin_lock_bh(&tn->nametbl_lock);
-	seq = nametbl_find_seq(sub->net, type);
-	if (seq != NULL) {
-		spin_lock_bh(&seq->lock);
-		list_del_init(&sub->nameseq_list);
-		tipc_sub_put(sub);
-		if (!seq->first_free && list_empty(&seq->subscriptions)) {
-			hlist_del_init_rcu(&seq->ns_list);
-			kfree(seq->sseqs);
-			spin_unlock_bh(&seq->lock);
-			kfree_rcu(seq, rcu);
-		} else {
-			spin_unlock_bh(&seq->lock);
-		}
+	sc = tipc_service_find(sub->net, type);
+	if (!sc)
+		goto exit;
+
+	spin_lock_bh(&sc->lock);
+	list_del_init(&sub->service_list);
+	tipc_sub_put(sub);
+
+	/* Delete service item if no more publications and subscriptions */
+	if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) {
+		hlist_del_init_rcu(&sc->service_list);
+		kfree_rcu(sc, rcu);
 	}
+	spin_unlock_bh(&sc->lock);
+exit:
 	spin_unlock_bh(&tn->nametbl_lock);
 }
 
 int tipc_nametbl_init(struct net *net)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	struct name_table *tipc_nametbl;
+	struct tipc_net *tn = tipc_net(net);
+	struct name_table *nt;
 	int i;
 
-	tipc_nametbl = kzalloc(sizeof(*tipc_nametbl), GFP_ATOMIC);
-	if (!tipc_nametbl)
+	nt = kzalloc(sizeof(*nt), GFP_ATOMIC);
+	if (!nt)
 		return -ENOMEM;
 
 	for (i = 0; i < TIPC_NAMETBL_SIZE; i++)
-		INIT_HLIST_HEAD(&tipc_nametbl->seq_hlist[i]);
+		INIT_HLIST_HEAD(&nt->services[i]);
 
-	INIT_LIST_HEAD(&tipc_nametbl->node_scope);
-	INIT_LIST_HEAD(&tipc_nametbl->cluster_scope);
-	tn->nametbl = tipc_nametbl;
+	INIT_LIST_HEAD(&nt->node_scope);
+	INIT_LIST_HEAD(&nt->cluster_scope);
+	tn->nametbl = nt;
 	spin_lock_init(&tn->nametbl_lock);
 	return 0;
 }
 
 /**
- * tipc_purge_publications - remove all publications for a given type
- *
- * tipc_nametbl_lock must be held when calling this function
+ *  tipc_service_delete - purge all publications for a service and delete it
  */
-static void tipc_purge_publications(struct net *net, struct name_seq *seq)
+static void tipc_service_delete(struct net *net, struct tipc_service *sc)
 {
-	struct publication *publ, *safe;
-	struct sub_seq *sseq;
-	struct name_info *info;
-
-	spin_lock_bh(&seq->lock);
-	sseq = seq->sseqs;
-	info = sseq->info;
-	list_for_each_entry_safe(publ, safe, &info->all_publ, all_publ) {
-		tipc_nameseq_remove_publ(net, seq, publ->lower, publ->node,
-					 publ->port, publ->key);
-		kfree_rcu(publ, rcu);
+	struct service_range *sr, *tmpr;
+	struct publication *p, *tmpb;
+
+	spin_lock_bh(&sc->lock);
+	rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) {
+		list_for_each_entry_safe(p, tmpb,
+					 &sr->all_publ, all_publ) {
+			tipc_service_remove_publ(net, sc, p->lower, p->node,
+						 p->port, p->key);
+			kfree_rcu(p, rcu);
+		}
 	}
-	hlist_del_init_rcu(&seq->ns_list);
-	kfree(seq->sseqs);
-	spin_unlock_bh(&seq->lock);
-
-	kfree_rcu(seq, rcu);
+	hlist_del_init_rcu(&sc->service_list);
+	spin_unlock_bh(&sc->lock);
+	kfree_rcu(sc, rcu);
 }
 
 void tipc_nametbl_stop(struct net *net)
 {
+	struct name_table *nt = tipc_name_table(net);
+	struct tipc_net *tn = tipc_net(net);
+	struct hlist_head *service_head;
+	struct tipc_service *service;
 	u32 i;
-	struct name_seq *seq;
-	struct hlist_head *seq_head;
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	struct name_table *tipc_nametbl = tn->nametbl;
 
 	/* Verify name table is empty and purge any lingering
 	 * publications, then release the name table
 	 */
 	spin_lock_bh(&tn->nametbl_lock);
 	for (i = 0; i < TIPC_NAMETBL_SIZE; i++) {
-		if (hlist_empty(&tipc_nametbl->seq_hlist[i]))
+		if (hlist_empty(&nt->services[i]))
 			continue;
-		seq_head = &tipc_nametbl->seq_hlist[i];
-		hlist_for_each_entry_rcu(seq, seq_head, ns_list) {
-			tipc_purge_publications(net, seq);
+		service_head = &nt->services[i];
+		hlist_for_each_entry_rcu(service, service_head, service_list) {
+			tipc_service_delete(net, service);
 		}
 	}
 	spin_unlock_bh(&tn->nametbl_lock);
 
 	synchronize_net();
-	kfree(tipc_nametbl);
-
+	kfree(nt);
 }
 
 static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg,
-					struct name_seq *seq,
-					struct sub_seq *sseq, u32 *last_publ)
+					struct tipc_service *service,
+					struct service_range *sr,
+					u32 *last_key)
 {
-	void *hdr;
-	struct nlattr *attrs;
-	struct nlattr *publ;
 	struct publication *p;
+	struct nlattr *attrs;
+	struct nlattr *b;
+	void *hdr;
 
-	if (*last_publ) {
-		list_for_each_entry(p, &sseq->info->all_publ, all_publ)
-			if (p->key == *last_publ)
+	if (*last_key) {
+		list_for_each_entry(p, &sr->all_publ, all_publ)
+			if (p->key == *last_key)
 				break;
-		if (p->key != *last_publ)
+		if (p->key != *last_key)
 			return -EPIPE;
 	} else {
-		p = list_first_entry(&sseq->info->all_publ, struct publication,
+		p = list_first_entry(&sr->all_publ,
+				     struct publication,
 				     all_publ);
 	}
 
-	list_for_each_entry_from(p, &sseq->info->all_publ, all_publ) {
-		*last_publ = p->key;
+	list_for_each_entry_from(p, &sr->all_publ, all_publ) {
+		*last_key = p->key;
 
 		hdr = genlmsg_put(msg->skb, msg->portid, msg->seq,
 				  &tipc_genl_family, NLM_F_MULTI,
@@ -925,15 +836,15 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg,
 		if (!attrs)
 			goto msg_full;
 
-		publ = nla_nest_start(msg->skb, TIPC_NLA_NAME_TABLE_PUBL);
-		if (!publ)
+		b = nla_nest_start(msg->skb, TIPC_NLA_NAME_TABLE_PUBL);
+		if (!b)
 			goto attr_msg_full;
 
-		if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, seq->type))
+		if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, service->type))
 			goto publ_msg_full;
-		if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sseq->lower))
+		if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sr->lower))
 			goto publ_msg_full;
-		if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sseq->upper))
+		if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sr->upper))
 			goto publ_msg_full;
 		if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope))
 			goto publ_msg_full;
@@ -944,16 +855,16 @@ static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg,
 		if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key))
 			goto publ_msg_full;
 
-		nla_nest_end(msg->skb, publ);
+		nla_nest_end(msg->skb, b);
 		nla_nest_end(msg->skb, attrs);
 		genlmsg_end(msg->skb, hdr);
 	}
-	*last_publ = 0;
+	*last_key = 0;
 
 	return 0;
 
 publ_msg_full:
-	nla_nest_cancel(msg->skb, publ);
+	nla_nest_cancel(msg->skb, b);
 attr_msg_full:
 	nla_nest_cancel(msg->skb, attrs);
 msg_full:
@@ -962,39 +873,34 @@ msg_full:
 	return -EMSGSIZE;
 }
 
-static int __tipc_nl_subseq_list(struct tipc_nl_msg *msg, struct name_seq *seq,
-				 u32 *last_lower, u32 *last_publ)
+static int __tipc_nl_service_range_list(struct tipc_nl_msg *msg,
+					struct tipc_service *sc,
+					u32 *last_lower, u32 *last_key)
 {
-	struct sub_seq *sseq;
-	struct sub_seq *sseq_start;
+	struct service_range *sr;
+	struct rb_node *n;
 	int err;
 
-	if (*last_lower) {
-		sseq_start = nameseq_find_subseq(seq, *last_lower);
-		if (!sseq_start)
-			return -EPIPE;
-	} else {
-		sseq_start = seq->sseqs;
-	}
-
-	for (sseq = sseq_start; sseq != &seq->sseqs[seq->first_free]; sseq++) {
-		err = __tipc_nl_add_nametable_publ(msg, seq, sseq, last_publ);
+	for (n = rb_first(&sc->ranges); n; n = rb_next(n)) {
+		sr = container_of(n, struct service_range, tree_node);
+		if (sr->lower < *last_lower)
+			continue;
+		err = __tipc_nl_add_nametable_publ(msg, sc, sr, last_key);
 		if (err) {
-			*last_lower = sseq->lower;
+			*last_lower = sr->lower;
 			return err;
 		}
 	}
 	*last_lower = 0;
-
 	return 0;
 }
 
-static int tipc_nl_seq_list(struct net *net, struct tipc_nl_msg *msg,
-			    u32 *last_type, u32 *last_lower, u32 *last_publ)
+static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg,
+				u32 *last_type, u32 *last_lower, u32 *last_key)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	struct hlist_head *seq_head;
-	struct name_seq *seq = NULL;
+	struct tipc_net *tn = tipc_net(net);
+	struct tipc_service *service = NULL;
+	struct hlist_head *head;
 	int err;
 	int i;
 
@@ -1004,30 +910,31 @@ static int tipc_nl_seq_list(struct net *net, struct tipc_nl_msg *msg,
 		i = 0;
 
 	for (; i < TIPC_NAMETBL_SIZE; i++) {
-		seq_head = &tn->nametbl->seq_hlist[i];
+		head = &tn->nametbl->services[i];
 
 		if (*last_type) {
-			seq = nametbl_find_seq(net, *last_type);
-			if (!seq)
+			service = tipc_service_find(net, *last_type);
+			if (!service)
 				return -EPIPE;
 		} else {
-			hlist_for_each_entry_rcu(seq, seq_head, ns_list)
+			hlist_for_each_entry_rcu(service, head, service_list)
 				break;
-			if (!seq)
+			if (!service)
 				continue;
 		}
 
-		hlist_for_each_entry_from_rcu(seq, ns_list) {
-			spin_lock_bh(&seq->lock);
-			err = __tipc_nl_subseq_list(msg, seq, last_lower,
-						    last_publ);
+		hlist_for_each_entry_from_rcu(service, service_list) {
+			spin_lock_bh(&service->lock);
+			err = __tipc_nl_service_range_list(msg, service,
+							   last_lower,
+							   last_key);
 
 			if (err) {
-				*last_type = seq->type;
-				spin_unlock_bh(&seq->lock);
+				*last_type = service->type;
+				spin_unlock_bh(&service->lock);
 				return err;
 			}
-			spin_unlock_bh(&seq->lock);
+			spin_unlock_bh(&service->lock);
 		}
 		*last_type = 0;
 	}
@@ -1036,13 +943,13 @@ static int tipc_nl_seq_list(struct net *net, struct tipc_nl_msg *msg,
 
 int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	int err;
-	int done = cb->args[3];
+	struct net *net = sock_net(skb->sk);
 	u32 last_type = cb->args[0];
 	u32 last_lower = cb->args[1];
-	u32 last_publ = cb->args[2];
-	struct net *net = sock_net(skb->sk);
+	u32 last_key = cb->args[2];
+	int done = cb->args[3];
 	struct tipc_nl_msg msg;
+	int err;
 
 	if (done)
 		return 0;
@@ -1052,7 +959,8 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	msg.seq = cb->nlh->nlmsg_seq;
 
 	rcu_read_lock();
-	err = tipc_nl_seq_list(net, &msg, &last_type, &last_lower, &last_publ);
+	err = tipc_nl_service_list(net, &msg, &last_type,
+				   &last_lower, &last_key);
 	if (!err) {
 		done = 1;
 	} else if (err != -EMSGSIZE) {
@@ -1068,7 +976,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
 	cb->args[0] = last_type;
 	cb->args[1] = last_lower;
-	cb->args[2] = last_publ;
+	cb->args[2] = last_key;
 	cb->args[3] = done;
 
 	return skb->len;
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 34a4ccb907aa..1b03b8751707 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -97,7 +97,7 @@ struct publication {
  * @local_publ_count: number of publications issued by this node
  */
 struct name_table {
-	struct hlist_head seq_hlist[TIPC_NAMETBL_SIZE];
+	struct hlist_head services[TIPC_NAMETBL_SIZE];
 	struct list_head node_scope;
 	struct list_head cluster_scope;
 	u32 local_publ_count;
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 4fb4327311bb..85e777e00ec9 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -324,12 +324,12 @@ static void tipc_node_write_unlock(struct tipc_node *n)
 	if (flags & TIPC_NOTIFY_LINK_UP) {
 		tipc_mon_peer_up(net, addr, bearer_id);
 		tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr,
-				     TIPC_NODE_SCOPE, link_id, addr);
+				     TIPC_NODE_SCOPE, link_id, link_id);
 	}
 	if (flags & TIPC_NOTIFY_LINK_DOWN) {
 		tipc_mon_peer_down(net, addr, bearer_id);
 		tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
-				      link_id, addr);
+				      link_id, link_id);
 	}
 }
 
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index 8b2d22b18f22..d793b4343885 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -40,7 +40,7 @@
 #include "topsrv.h"
 
 #define TIPC_MAX_SUBSCR         65535
-#define TIPC_MAX_PUBLICATIONS   65535
+#define TIPC_MAX_PUBL           65535
 
 struct tipc_subscription;
 struct tipc_conn;
@@ -58,7 +58,7 @@ struct tipc_subscription {
 	struct kref kref;
 	struct net *net;
 	struct timer_list timer;
-	struct list_head nameseq_list;
+	struct list_head service_list;
 	struct list_head sub_list;
 	struct tipc_event evt;
 	int conid;
-- 
cgit v1.2.3


From f20889f72bd531cad88fbb571755a52cabf43424 Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 29 Mar 2018 23:20:42 +0200
Subject: tipc: refactor name table translate function

The function tipc_nametbl_translate() function is ugly and hard to
follow. This can be improved somewhat by introducing a stack variable
for holding the publication list to be used and re-ordering the if-
clauses for selection of algorithm.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_table.c | 61 +++++++++++++++++++++------------------------------
 1 file changed, 25 insertions(+), 36 deletions(-)

(limited to 'net')

diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index e06c7a8907aa..4bdc580c533b 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -399,29 +399,32 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
 /**
  * tipc_nametbl_translate - perform service instance to socket translation
  *
- * On entry, 'destnode' is the search domain used during translation.
+ * On entry, 'dnode' is the search domain used during translation.
  *
  * On exit:
- * - if name translation is deferred to another node/cluster/zone,
- *   leaves 'destnode' unchanged (will be non-zero) and returns 0
- * - if name translation is attempted and succeeds, sets 'destnode'
- *   to publication node and returns port reference (will be non-zero)
- * - if name translation is attempted and fails, sets 'destnode' to 0
- *   and returns 0
+ * - if translation is deferred to another node, leave 'dnode' unchanged and
+ *   return 0
+ * - if translation is attempted and succeeds, set 'dnode' to the publishing
+ *   node and return the published (non-zero) port number
+ * - if translation is attempted and fails, set 'dnode' to 0 and return 0
+ *
+ * Note that for legacy users (node configured with Z.C.N address format) the
+ * 'closest-first' lookup algorithm must be maintained, i.e., if dnode is 0
+ * we must look in the local binding list first
  */
-u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
-			   u32 *destnode)
+u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode)
 {
 	struct tipc_net *tn = tipc_net(net);
 	bool legacy = tn->legacy_addr_format;
 	u32 self = tipc_own_addr(net);
 	struct service_range *sr;
 	struct tipc_service *sc;
+	struct list_head *list;
 	struct publication *p;
 	u32 port = 0;
 	u32 node = 0;
 
-	if (!tipc_in_scope(legacy, *destnode, self))
+	if (!tipc_in_scope(legacy, *dnode, self))
 		return 0;
 
 	rcu_read_lock();
@@ -434,43 +437,29 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
 	if (unlikely(!sr))
 		goto no_match;
 
-	/* Closest-First Algorithm */
-	if (legacy && !*destnode) {
-		if (!list_empty(&sr->local_publ)) {
-			p = list_first_entry(&sr->local_publ,
-					     struct publication,
-					     local_publ);
-			list_move_tail(&p->local_publ,
-				       &sr->local_publ);
-		} else {
-			p = list_first_entry(&sr->all_publ,
-					     struct publication,
-					     all_publ);
-			list_move_tail(&p->all_publ,
-				       &sr->all_publ);
-		}
-	}
-
-	/* Round-Robin Algorithm */
-	else if (*destnode == self) {
-		if (list_empty(&sr->local_publ))
+	/* Select lookup algorithm: local, closest-first or round-robin */
+	if (*dnode == self) {
+		list = &sr->local_publ;
+		if (list_empty(list))
 			goto no_match;
-		p = list_first_entry(&sr->local_publ, struct publication,
-				     local_publ);
+		p = list_first_entry(list, struct publication, local_publ);
+		list_move_tail(&p->local_publ, &sr->local_publ);
+	} else if (legacy && !*dnode && !list_empty(&sr->local_publ)) {
+		list = &sr->local_publ;
+		p = list_first_entry(list, struct publication, local_publ);
 		list_move_tail(&p->local_publ, &sr->local_publ);
 	} else {
-		p = list_first_entry(&sr->all_publ, struct publication,
-				     all_publ);
+		list = &sr->all_publ;
+		p = list_first_entry(list, struct publication, all_publ);
 		list_move_tail(&p->all_publ, &sr->all_publ);
 	}
-
 	port = p->port;
 	node = p->node;
 no_match:
 	spin_unlock_bh(&sc->lock);
 not_found:
 	rcu_read_unlock();
-	*destnode = node;
+	*dnode = node;
 	return port;
 }
 
-- 
cgit v1.2.3


From 37922ea4a3105176357c8d565a9d982c4a08714a Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 29 Mar 2018 23:20:43 +0200
Subject: tipc: permit overlapping service ranges in name table

With the new RB tree structure for service ranges it becomes possible to
solve an old problem; - we can now allow overlapping service ranges in
the table.

When inserting a new service range to the tree, we use 'lower' as primary
key, and when necessary 'upper' as secondary key.

Since there may now be multiple service ranges matching an indicated
'lower' value, we must also add the 'upper' value to the functions
used for removing publications, so that the correct, corresponding
range item can be found.

These changes guarantee that a well-formed publication/withdrawal item
from a peer node never will be rejected, and make it possible to
eliminate the problematic backlog functionality we currently have for
handling such cases.

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/name_distr.c | 90 +++++++++++++--------------------------------------
 net/tipc/name_distr.h |  1 -
 net/tipc/name_table.c | 64 +++++++++++++++++-------------------
 net/tipc/name_table.h |  8 ++---
 net/tipc/net.c        |  2 +-
 net/tipc/node.c       |  2 +-
 net/tipc/socket.c     |  4 +--
 7 files changed, 60 insertions(+), 111 deletions(-)

(limited to 'net')

diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 8240a85b0d0c..51b4b96f89db 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -204,12 +204,12 @@ void tipc_named_node_up(struct net *net, u32 dnode)
  */
 static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr)
 {
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
+	struct tipc_net *tn = tipc_net(net);
 	struct publication *p;
 
 	spin_lock_bh(&tn->nametbl_lock);
-	p = tipc_nametbl_remove_publ(net, publ->type, publ->lower,
-				     publ->node, publ->port, publ->key);
+	p = tipc_nametbl_remove_publ(net, publ->type, publ->lower, publ->upper,
+				     publ->node, publ->key);
 	if (p)
 		tipc_node_unsubscribe(net, &p->binding_node, addr);
 	spin_unlock_bh(&tn->nametbl_lock);
@@ -261,81 +261,37 @@ void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr)
 static bool tipc_update_nametbl(struct net *net, struct distr_item *i,
 				u32 node, u32 dtype)
 {
-	struct publication *publ = NULL;
+	struct publication *p = NULL;
+	u32 lower = ntohl(i->lower);
+	u32 upper = ntohl(i->upper);
+	u32 type = ntohl(i->type);
+	u32 port = ntohl(i->port);
+	u32 key = ntohl(i->key);
 
 	if (dtype == PUBLICATION) {
-		publ = tipc_nametbl_insert_publ(net, ntohl(i->type),
-						ntohl(i->lower),
-						ntohl(i->upper),
-						TIPC_CLUSTER_SCOPE, node,
-						ntohl(i->port), ntohl(i->key));
-		if (publ) {
-			tipc_node_subscribe(net, &publ->binding_node, node);
+		p = tipc_nametbl_insert_publ(net, type, lower, upper,
+					     TIPC_CLUSTER_SCOPE, node,
+					     port, key);
+		if (p) {
+			tipc_node_subscribe(net, &p->binding_node, node);
 			return true;
 		}
 	} else if (dtype == WITHDRAWAL) {
-		publ = tipc_nametbl_remove_publ(net, ntohl(i->type),
-						ntohl(i->lower),
-						node, ntohl(i->port),
-						ntohl(i->key));
-		if (publ) {
-			tipc_node_unsubscribe(net, &publ->binding_node, node);
-			kfree_rcu(publ, rcu);
+		p = tipc_nametbl_remove_publ(net, type, lower,
+					     upper, node, key);
+		if (p) {
+			tipc_node_unsubscribe(net, &p->binding_node, node);
+			kfree_rcu(p, rcu);
 			return true;
 		}
+		pr_warn_ratelimited("Failed to remove binding %u,%u from %x\n",
+				    type, lower, node);
 	} else {
 		pr_warn("Unrecognized name table message received\n");
 	}
 	return false;
 }
 
-/**
- * tipc_named_add_backlog - add a failed name table update to the backlog
- *
- */
-static void tipc_named_add_backlog(struct net *net, struct distr_item *i,
-				   u32 type, u32 node)
-{
-	struct distr_queue_item *e;
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	unsigned long now = get_jiffies_64();
-
-	e = kzalloc(sizeof(*e), GFP_ATOMIC);
-	if (!e)
-		return;
-	e->dtype = type;
-	e->node = node;
-	e->expires = now + msecs_to_jiffies(sysctl_tipc_named_timeout);
-	memcpy(e, i, sizeof(*i));
-	list_add_tail(&e->next, &tn->dist_queue);
-}
-
-/**
- * tipc_named_process_backlog - try to process any pending name table updates
- * from the network.
- */
-void tipc_named_process_backlog(struct net *net)
-{
-	struct distr_queue_item *e, *tmp;
-	struct tipc_net *tn = net_generic(net, tipc_net_id);
-	unsigned long now = get_jiffies_64();
-
-	list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) {
-		if (time_after(e->expires, now)) {
-			if (!tipc_update_nametbl(net, &e->i, e->node, e->dtype))
-				continue;
-		} else {
-			pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %x key=%u\n",
-					    e->dtype, ntohl(e->i.type),
-					    ntohl(e->i.lower),
-					    ntohl(e->i.upper),
-					    e->node, ntohl(e->i.key));
-		}
-		list_del(&e->next);
-		kfree(e);
-	}
-}
-
 /**
  * tipc_named_rcv - process name table update messages sent by another node
  */
@@ -358,12 +314,10 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq)
 		count = msg_data_sz(msg) / ITEM_SIZE;
 		node = msg_orignode(msg);
 		while (count--) {
-			if (!tipc_update_nametbl(net, item, node, mtype))
-				tipc_named_add_backlog(net, item, mtype, node);
+			tipc_update_nametbl(net, item, node, mtype);
 			item++;
 		}
 		kfree_skb(skb);
-		tipc_named_process_backlog(net);
 	}
 	spin_unlock_bh(&tn->nametbl_lock);
 }
diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h
index 4753e628d7c4..63fc73e0fa6c 100644
--- a/net/tipc/name_distr.h
+++ b/net/tipc/name_distr.h
@@ -72,7 +72,6 @@ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ);
 void tipc_named_node_up(struct net *net, u32 dnode);
 void tipc_named_rcv(struct net *net, struct sk_buff_head *msg_queue);
 void tipc_named_reinit(struct net *net);
-void tipc_named_process_backlog(struct net *net);
 void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr);
 
 #endif
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 4bdc580c533b..b1fe20972aa9 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -171,10 +171,14 @@ static struct service_range *tipc_service_create_range(struct tipc_service *sc,
 		tmp = container_of(parent, struct service_range, tree_node);
 		if (lower < tmp->lower)
 			n = &(*n)->rb_left;
+		else if (lower > tmp->lower)
+			n = &(*n)->rb_right;
+		else if (upper < tmp->upper)
+			n = &(*n)->rb_left;
 		else if (upper > tmp->upper)
 			n = &(*n)->rb_right;
 		else
-			return NULL;
+			return tmp;
 	}
 	sr = kzalloc(sizeof(*sr), GFP_ATOMIC);
 	if (!sr)
@@ -200,17 +204,11 @@ static struct publication *tipc_service_insert_publ(struct net *net,
 	struct publication *p;
 	bool first = false;
 
-	sr = tipc_service_find_range(sc, lower);
-	if (!sr) {
-		sr = tipc_service_create_range(sc, lower, upper);
-		if (!sr)
-			goto  err;
-		first = true;
-	}
+	sr = tipc_service_create_range(sc, lower, upper);
+	if (!sr)
+		goto  err;
 
-	/* Lower end overlaps existing entry, but we need an exact match */
-	if (sr->lower != lower || sr->upper != upper)
-		return NULL;
+	first = list_empty(&sr->all_publ);
 
 	/* Return if the publication already exists */
 	list_for_each_entry(p, &sr->all_publ, all_publ) {
@@ -239,30 +237,32 @@ err:
 
 /**
  * tipc_service_remove_publ - remove a publication from a service
- *
- * NOTE: There may be cases where TIPC is asked to remove a publication
- * that is not in the name table.  For example, if another node issues a
- * publication for a name range that overlaps an existing name range
- * the publication will not be recorded, which means the publication won't
- * be found when the name range is later withdrawn by that node.
- * A failed withdraw request simply returns a failure indication and lets the
- * caller issue any error or warning messages associated with such a problem.
  */
 static struct publication *tipc_service_remove_publ(struct net *net,
 						    struct tipc_service *sc,
-						    u32 inst, u32 node,
-						    u32 port, u32 key)
+						    u32 lower, u32 upper,
+						    u32 node, u32 key)
 {
 	struct tipc_subscription *sub, *tmp;
 	struct service_range *sr;
 	struct publication *p;
 	bool found = false;
 	bool last = false;
+	struct rb_node *n;
 
-	sr = tipc_service_find_range(sc, inst);
+	sr = tipc_service_find_range(sc, lower);
 	if (!sr)
 		return NULL;
 
+	/* Find exact matching service range */
+	for (n = &sr->tree_node; n; n = rb_next(n)) {
+		sr = container_of(n, struct service_range, tree_node);
+		if (sr->upper == upper)
+			break;
+	}
+	if (!n || sr->lower != lower || sr->upper != upper)
+		return NULL;
+
 	/* Find publication, if it exists */
 	list_for_each_entry(p, &sr->all_publ, all_publ) {
 		if (p->key != key || (node && node != p->node))
@@ -375,8 +375,8 @@ struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
 }
 
 struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
-						 u32 lower, u32 node, u32 port,
-						 u32 key)
+					     u32 lower, u32 upper,
+					     u32 node, u32 key)
 {
 	struct tipc_service *sc = tipc_service_find(net, type);
 	struct publication *p = NULL;
@@ -385,7 +385,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
 		return NULL;
 
 	spin_lock_bh(&sc->lock);
-	p = tipc_service_remove_publ(net, sc, lower, node, port, key);
+	p = tipc_service_remove_publ(net, sc, lower, upper, node, key);
 
 	/* Delete service item if this no more publications and subscriptions */
 	if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) {
@@ -620,8 +620,6 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
 	if (p) {
 		nt->local_publ_count++;
 		skb = tipc_named_publish(net, p);
-		/* Any pending external events? */
-		tipc_named_process_backlog(net);
 	}
 exit:
 	spin_unlock_bh(&tn->nametbl_lock);
@@ -635,7 +633,7 @@ exit:
  * tipc_nametbl_withdraw - withdraw a service binding
  */
 int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower,
-			  u32 port, u32 key)
+			  u32 upper, u32 key)
 {
 	struct name_table *nt = tipc_name_table(net);
 	struct tipc_net *tn = tipc_net(net);
@@ -645,17 +643,15 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower,
 
 	spin_lock_bh(&tn->nametbl_lock);
 
-	p = tipc_nametbl_remove_publ(net, type, lower, self, port, key);
+	p = tipc_nametbl_remove_publ(net, type, lower, upper, self, key);
 	if (p) {
 		nt->local_publ_count--;
 		skb = tipc_named_withdraw(net, p);
-		/* Any pending external events? */
-		tipc_named_process_backlog(net);
 		list_del_init(&p->binding_sock);
 		kfree_rcu(p, rcu);
 	} else {
 		pr_err("Failed to remove local publication {%u,%u,%u}/%u\n",
-		       type, lower, port, key);
+		       type, lower, upper, key);
 	}
 	spin_unlock_bh(&tn->nametbl_lock);
 
@@ -754,8 +750,8 @@ static void tipc_service_delete(struct net *net, struct tipc_service *sc)
 	rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) {
 		list_for_each_entry_safe(p, tmpb,
 					 &sr->all_publ, all_publ) {
-			tipc_service_remove_publ(net, sc, p->lower, p->node,
-						 p->port, p->key);
+			tipc_service_remove_publ(net, sc, p->lower, p->upper,
+						 p->node, p->key);
 			kfree_rcu(p, rcu);
 		}
 	}
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 1b03b8751707..4b14fc28d9e2 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -116,16 +116,16 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain,
 			 struct list_head *dsts, int *dstcnt, u32 exclude,
 			 bool all);
 struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
-					 u32 upper, u32 scope, u32 port_ref,
+					 u32 upper, u32 scope, u32 port,
 					 u32 key);
-int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref,
+int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 upper,
 			  u32 key);
 struct publication *tipc_nametbl_insert_publ(struct net *net, u32 type,
 					     u32 lower, u32 upper, u32 scope,
 					     u32 node, u32 ref, u32 key);
 struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type,
-					     u32 lower, u32 node, u32 ref,
-					     u32 key);
+					     u32 lower, u32 upper,
+					     u32 node, u32 key);
 void tipc_nametbl_subscribe(struct tipc_subscription *s);
 void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
 int tipc_nametbl_init(struct net *net);
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 29538dc00857..856f9e97ea29 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -136,7 +136,7 @@ void tipc_net_stop(struct net *net)
 	if (!self)
 		return;
 
-	tipc_nametbl_withdraw(net, TIPC_CFG_SRV, self, 0, self);
+	tipc_nametbl_withdraw(net, TIPC_CFG_SRV, self, self, self);
 	rtnl_lock();
 	tipc_bearer_stop(net);
 	tipc_node_stop(net);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 85e777e00ec9..c77dd2f3c589 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -329,7 +329,7 @@ static void tipc_node_write_unlock(struct tipc_node *n)
 	if (flags & TIPC_NOTIFY_LINK_DOWN) {
 		tipc_mon_peer_down(net, addr, bearer_id);
 		tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
-				      link_id, link_id);
+				      addr, link_id);
 	}
 }
 
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 275b666f6231..3e5eba30865e 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2634,12 +2634,12 @@ static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope,
 			if (publ->upper != seq->upper)
 				break;
 			tipc_nametbl_withdraw(net, publ->type, publ->lower,
-					      publ->port, publ->key);
+					      publ->upper, publ->key);
 			rc = 0;
 			break;
 		}
 		tipc_nametbl_withdraw(net, publ->type, publ->lower,
-				      publ->port, publ->key);
+				      publ->upper, publ->key);
 		rc = 0;
 	}
 	if (list_empty(&tsk->publications))
-- 
cgit v1.2.3


From 7494cfa6d36d1556f17baa012dd93833620783db Mon Sep 17 00:00:00 2001
From: Jon Maloy <jon.maloy@ericsson.com>
Date: Thu, 29 Mar 2018 23:20:45 +0200
Subject: tipc: avoid possible string overflow

gcc points out that the combined length of the fixed-length inputs to
l->name is larger than the destination buffer size:

net/tipc/link.c: In function 'tipc_link_create':
net/tipc/link.c:465:26: error: '%s' directive writing up to 32 bytes
into a region of size between 26 and 58 [-Werror=format-overflow=]
sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str);

net/tipc/link.c:465:2: note: 'sprintf' output 11 or more bytes
(assuming 75) into a destination of size 60
sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str);

A detailed analysis reveals that the theoretical maximum length of
a link name is:
max self_str + 1 + max if_name + 1 + max peer_str + 1 + max if_name =
16 + 1 + 15 + 1 + 16 + 1 + 15 = 65
Since we also need space for a trailing zero we now set MAX_LINK_NAME
to 68.

Just to be on the safe side we also replace the sprintf() call with
snprintf().

Fixes: 25b0b9c4e835 ("tipc: handle collisions of 32-bit node address
hash values")
Reported-by: Arnd Bergmann <arnd@arndb.de>

Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/link.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 8f2a9496439b..695acb783969 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -462,7 +462,8 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
 			sprintf(peer_str, "%x", peer);
 	}
 	/* Peer i/f name will be completed by reset/activate message */
-	sprintf(l->name, "%s:%s-%s:unknown", self_str, if_name, peer_str);
+	snprintf(l->name, sizeof(l->name), "%s:%s-%s:unknown",
+		 self_str, if_name, peer_str);
 
 	strcpy(l->if_name, if_name);
 	l->addr = peer;
-- 
cgit v1.2.3


From fc1dd36992bb041b4470120aecf8986910c56088 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 30 Mar 2018 19:38:27 +0300
Subject: net: Remove net_rwsem from {, un}register_netdevice_notifier()

These functions take net_rwsem, while wireless_nlevent_flush()
also takes it. But down_read() can't be taken recursive,
because of rw_semaphore design, which prevents it to be occupied
by only readers forever.

Since we take pernet_ops_rwsem in {,un}register_netdevice_notifier(),
net list can't change, so these down_read()/up_read() can be removed.

Fixes: f0b07bb151b0 "net: Introduce net_rwsem to protect net_namespace_list"
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'net')

diff --git a/net/core/dev.c b/net/core/dev.c
index 07da7add4845..8edb58829124 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1633,7 +1633,6 @@ int register_netdevice_notifier(struct notifier_block *nb)
 		goto unlock;
 	if (dev_boot_phase)
 		goto unlock;
-	down_read(&net_rwsem);
 	for_each_net(net) {
 		for_each_netdev(net, dev) {
 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
@@ -1647,7 +1646,6 @@ int register_netdevice_notifier(struct notifier_block *nb)
 			call_netdevice_notifier(nb, NETDEV_UP, dev);
 		}
 	}
-	up_read(&net_rwsem);
 
 unlock:
 	rtnl_unlock();
@@ -1671,7 +1669,6 @@ rollback:
 	}
 
 outroll:
-	up_read(&net_rwsem);
 	raw_notifier_chain_unregister(&netdev_chain, nb);
 	goto unlock;
 }
@@ -1704,7 +1701,6 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
 	if (err)
 		goto unlock;
 
-	down_read(&net_rwsem);
 	for_each_net(net) {
 		for_each_netdev(net, dev) {
 			if (dev->flags & IFF_UP) {
@@ -1715,7 +1711,6 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
 		}
 	}
-	up_read(&net_rwsem);
 unlock:
 	rtnl_unlock();
 	up_write(&pernet_ops_rwsem);
-- 
cgit v1.2.3


From 554873e517115c4b7207259f1cadfd77d90b5395 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 30 Mar 2018 19:38:37 +0300
Subject: net: Do not take net_rwsem in __rtnl_link_unregister()

This function calls call_netdevice_notifier(), which also
may take net_rwsem. So, we can't use net_rwsem here.

This patch makes callers of this functions take pernet_ops_rwsem,
like register_netdevice_notifier() does. This will protect
the modifications of net_namespace_list, and allows notifiers
to take it (they won't have to care about context).

Since __rtnl_link_unregister() is used on module load
and unload (which are not frequent operations), this looks
for me better, than make all call_netdevice_notifier()
always executing in "protected net_namespace_list" context.

Also, this fixes the problem we had a deal in 328fbe747ad4
"Close race between {un, }register_netdevice_notifier and ...",
and guarantees __rtnl_link_unregister() does not skip
exitting net.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/net_namespace.c | 1 +
 net/core/rtnetlink.c     | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7fdf321d4997..a11e03f920d3 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -51,6 +51,7 @@ static bool init_net_initialized;
  * outside.
  */
 DECLARE_RWSEM(pernet_ops_rwsem);
+EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
 
 #define MIN_PERNET_OPS_ID	\
 	((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e86b28482ca7..45936922d7e2 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -412,17 +412,17 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
  * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
  * @ops: struct rtnl_link_ops * to unregister
  *
- * The caller must hold the rtnl_mutex.
+ * The caller must hold the rtnl_mutex and guarantee net_namespace_list
+ * integrity (hold pernet_ops_rwsem for writing to close the race
+ * with setup_net() and cleanup_net()).
  */
 void __rtnl_link_unregister(struct rtnl_link_ops *ops)
 {
 	struct net *net;
 
-	down_read(&net_rwsem);
 	for_each_net(net) {
 		__rtnl_kill_links(net, ops);
 	}
-	up_read(&net_rwsem);
 	list_del(&ops->list);
 }
 EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
-- 
cgit v1.2.3


From eeb0a2a526d816253705977033e98fdfbab32387 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Sat, 31 Mar 2018 06:11:41 +0000
Subject: vlan: vlan_hw_filter_capable() can be static

Fixes the following sparse warning:

net/8021q/vlan_core.c:168:6: warning:
 symbol 'vlan_hw_filter_capable' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index c8d7abdc0463..4f60e86f4b8d 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -165,7 +165,7 @@ struct vlan_vid_info {
 	int refcount;
 };
 
-bool vlan_hw_filter_capable(const struct net_device *dev, __be16 proto)
+static bool vlan_hw_filter_capable(const struct net_device *dev, __be16 proto)
 {
 	if (proto == htons(ETH_P_8021Q) &&
 	    dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)
-- 
cgit v1.2.3


From 787bea7748a76130566f881c2342a0be4127d182 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:43 -0700
Subject: inet: frags: change inet_frags_init_net() return value

We will soon initialize one rhashtable per struct netns_frags
in inet_frags_init_net().

This patch changes the return value to eventually propagate an
error.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/6lowpan/reassembly.c     | 11 ++++++++---
 net/ipv4/ip_fragment.c                  | 12 +++++++++---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 12 +++++++++---
 net/ipv6/reassembly.c                   | 11 +++++++++--
 4 files changed, 35 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 85bf86ad6b18..2aaab4bba429 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -581,14 +581,19 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 {
 	struct netns_ieee802154_lowpan *ieee802154_lowpan =
 		net_ieee802154_lowpan(net);
+	int res;
 
 	ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
 
-	inet_frags_init_net(&ieee802154_lowpan->frags);
-
-	return lowpan_frags_ns_sysctl_register(net);
+	res = inet_frags_init_net(&ieee802154_lowpan->frags);
+	if (res < 0)
+		return res;
+	res = lowpan_frags_ns_sysctl_register(net);
+	if (res < 0)
+		inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+	return res;
 }
 
 static void __net_exit lowpan_frags_exit_net(struct net *net)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index bbf1b94942c0..e0b39d4ecbd4 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -846,6 +846,8 @@ static void __init ip4_frags_ctl_register(void)
 
 static int __net_init ipv4_frags_init_net(struct net *net)
 {
+	int res;
+
 	/* Fragment cache limits.
 	 *
 	 * The fragment memory accounting code, (tries to) account for
@@ -871,9 +873,13 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 
 	net->ipv4.frags.max_dist = 64;
 
-	inet_frags_init_net(&net->ipv4.frags);
-
-	return ip4_frags_ns_ctl_register(net);
+	res = inet_frags_init_net(&net->ipv4.frags);
+	if (res < 0)
+		return res;
+	res = ip4_frags_ns_ctl_register(net);
+	if (res < 0)
+		inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+	return res;
 }
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index b84ce3e6d728..6ff41569134a 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -629,12 +629,18 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
 
 static int nf_ct_net_init(struct net *net)
 {
+	int res;
+
 	net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
-	inet_frags_init_net(&net->nf_frag.frags);
-
-	return nf_ct_frag6_sysctl_register(net);
+	res = inet_frags_init_net(&net->nf_frag.frags);
+	if (res < 0)
+		return res;
+	res = nf_ct_frag6_sysctl_register(net);
+	if (res < 0)
+		inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+	return res;
 }
 
 static void nf_ct_net_exit(struct net *net)
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 08a139f14d0f..a8f7a5f0251a 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -711,13 +711,20 @@ static void ip6_frags_sysctl_unregister(void)
 
 static int __net_init ipv6_frags_init_net(struct net *net)
 {
+	int res;
+
 	net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
 
-	inet_frags_init_net(&net->ipv6.frags);
+	res = inet_frags_init_net(&net->ipv6.frags);
+	if (res < 0)
+		return res;
 
-	return ip6_frags_ns_sysctl_register(net);
+	res = ip6_frags_ns_sysctl_register(net);
+	if (res < 0)
+		inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+	return res;
 }
 
 static void __net_exit ipv6_frags_exit_net(struct net *net)
-- 
cgit v1.2.3


From 093ba72914b696521e4885756a68a3332782c8de Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:44 -0700
Subject: inet: frags: add a pointer to struct netns_frags

In order to simplify the API, add a pointer to struct inet_frags.
This will allow us to make things less complex.

These functions no longer have a struct inet_frags parameter :

inet_frag_destroy(struct inet_frag_queue *q  /*, struct inet_frags *f */)
inet_frag_put(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frag_kill(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frags_exit_net(struct netns_frags *nf /*, struct inet_frags *f */)
ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/6lowpan/reassembly.c     | 13 +++++++------
 net/ipv4/inet_fragment.c                | 17 ++++++++++-------
 net/ipv4/ip_fragment.c                  |  9 +++++----
 net/ipv6/netfilter/nf_conntrack_reasm.c | 16 +++++++++-------
 net/ipv6/reassembly.c                   | 20 ++++++++++----------
 5 files changed, 41 insertions(+), 34 deletions(-)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 2aaab4bba429..6badc055555b 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -94,10 +94,10 @@ static void lowpan_frag_expire(struct timer_list *t)
 	if (fq->q.flags & INET_FRAG_COMPLETE)
 		goto out;
 
-	inet_frag_kill(&fq->q, &lowpan_frags);
+	inet_frag_kill(&fq->q);
 out:
 	spin_unlock(&fq->q.lock);
-	inet_frag_put(&fq->q, &lowpan_frags);
+	inet_frag_put(&fq->q);
 }
 
 static inline struct lowpan_frag_queue *
@@ -230,7 +230,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
 	struct sk_buff *fp, *head = fq->q.fragments;
 	int sum_truesize;
 
-	inet_frag_kill(&fq->q, &lowpan_frags);
+	inet_frag_kill(&fq->q);
 
 	/* Make the one we just received the head. */
 	if (prev) {
@@ -438,7 +438,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
 		ret = lowpan_frag_queue(fq, skb, frag_type);
 		spin_unlock(&fq->q.lock);
 
-		inet_frag_put(&fq->q, &lowpan_frags);
+		inet_frag_put(&fq->q);
 		return ret;
 	}
 
@@ -586,13 +586,14 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 	ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
+	ieee802154_lowpan->frags.f = &lowpan_frags;
 
 	res = inet_frags_init_net(&ieee802154_lowpan->frags);
 	if (res < 0)
 		return res;
 	res = lowpan_frags_ns_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+		inet_frags_exit_net(&ieee802154_lowpan->frags);
 	return res;
 }
 
@@ -602,7 +603,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net)
 		net_ieee802154_lowpan(net);
 
 	lowpan_frags_ns_sysctl_unregister(net);
-	inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+	inet_frags_exit_net(&ieee802154_lowpan->frags);
 }
 
 static struct pernet_operations lowpan_frags_ops = {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index e8ec28999f5c..1ac69f65d0de 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -219,8 +219,9 @@ void inet_frags_fini(struct inet_frags *f)
 }
 EXPORT_SYMBOL(inet_frags_fini);
 
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
+void inet_frags_exit_net(struct netns_frags *nf)
 {
+	struct inet_frags *f =nf->f;
 	unsigned int seq;
 	int i;
 
@@ -264,33 +265,34 @@ __acquires(hb->chain_lock)
 	return hb;
 }
 
-static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
+static inline void fq_unlink(struct inet_frag_queue *fq)
 {
 	struct inet_frag_bucket *hb;
 
-	hb = get_frag_bucket_locked(fq, f);
+	hb = get_frag_bucket_locked(fq, fq->net->f);
 	hlist_del(&fq->list);
 	fq->flags |= INET_FRAG_COMPLETE;
 	spin_unlock(&hb->chain_lock);
 }
 
-void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
+void inet_frag_kill(struct inet_frag_queue *fq)
 {
 	if (del_timer(&fq->timer))
 		refcount_dec(&fq->refcnt);
 
 	if (!(fq->flags & INET_FRAG_COMPLETE)) {
-		fq_unlink(fq, f);
+		fq_unlink(fq);
 		refcount_dec(&fq->refcnt);
 	}
 }
 EXPORT_SYMBOL(inet_frag_kill);
 
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
+void inet_frag_destroy(struct inet_frag_queue *q)
 {
 	struct sk_buff *fp;
 	struct netns_frags *nf;
 	unsigned int sum, sum_truesize = 0;
+	struct inet_frags *f;
 
 	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
 	WARN_ON(del_timer(&q->timer) != 0);
@@ -298,6 +300,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
 	/* Release all fragment data. */
 	fp = q->fragments;
 	nf = q->net;
+	f = nf->f;
 	while (fp) {
 		struct sk_buff *xp = fp->next;
 
@@ -333,7 +336,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 			refcount_inc(&qp->refcnt);
 			spin_unlock(&hb->chain_lock);
 			qp_in->flags |= INET_FRAG_COMPLETE;
-			inet_frag_put(qp_in, f);
+			inet_frag_put(qp_in);
 			return qp;
 		}
 	}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e0b39d4ecbd4..cd2b4c9419fc 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -168,7 +168,7 @@ static void ip4_frag_free(struct inet_frag_queue *q)
 
 static void ipq_put(struct ipq *ipq)
 {
-	inet_frag_put(&ipq->q, &ip4_frags);
+	inet_frag_put(&ipq->q);
 }
 
 /* Kill ipq entry. It is not destroyed immediately,
@@ -176,7 +176,7 @@ static void ipq_put(struct ipq *ipq)
  */
 static void ipq_kill(struct ipq *ipq)
 {
-	inet_frag_kill(&ipq->q, &ip4_frags);
+	inet_frag_kill(&ipq->q);
 }
 
 static bool frag_expire_skip_icmp(u32 user)
@@ -872,20 +872,21 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 	net->ipv4.frags.timeout = IP_FRAG_TIME;
 
 	net->ipv4.frags.max_dist = 64;
+	net->ipv4.frags.f = &ip4_frags;
 
 	res = inet_frags_init_net(&net->ipv4.frags);
 	if (res < 0)
 		return res;
 	res = ip4_frags_ns_ctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+		inet_frags_exit_net(&net->ipv4.frags);
 	return res;
 }
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
 {
 	ip4_frags_ns_ctl_unregister(net);
-	inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+	inet_frags_exit_net(&net->ipv4.frags);
 }
 
 static struct pernet_operations ip4_frags_ops = {
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 6ff41569134a..c4b40fdee838 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -178,7 +178,7 @@ static void nf_ct_frag6_expire(struct timer_list *t)
 	fq = container_of(frag, struct frag_queue, q);
 	net = container_of(fq->q.net, struct net, nf_frag.frags);
 
-	ip6_expire_frag_queue(net, fq, &nf_frags);
+	ip6_expire_frag_queue(net, fq);
 }
 
 /* Creation primitives. */
@@ -264,7 +264,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 			 * this case. -DaveM
 			 */
 			pr_debug("end of fragment not rounded to 8 bytes.\n");
-			inet_frag_kill(&fq->q, &nf_frags);
+			inet_frag_kill(&fq->q);
 			return -EPROTO;
 		}
 		if (end > fq->q.len) {
@@ -357,7 +357,7 @@ found:
 	return 0;
 
 discard_fq:
-	inet_frag_kill(&fq->q, &nf_frags);
+	inet_frag_kill(&fq->q);
 err:
 	return -EINVAL;
 }
@@ -379,7 +379,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev,  struct net_devic
 	int    payload_len;
 	u8 ecn;
 
-	inet_frag_kill(&fq->q, &nf_frags);
+	inet_frag_kill(&fq->q);
 
 	WARN_ON(head == NULL);
 	WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);
@@ -622,7 +622,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 
 out_unlock:
 	spin_unlock_bh(&fq->q.lock);
-	inet_frag_put(&fq->q, &nf_frags);
+	inet_frag_put(&fq->q);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
@@ -634,19 +634,21 @@ static int nf_ct_net_init(struct net *net)
 	net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
+	net->nf_frag.frags.f = &nf_frags;
+
 	res = inet_frags_init_net(&net->nf_frag.frags);
 	if (res < 0)
 		return res;
 	res = nf_ct_frag6_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+		inet_frags_exit_net(&net->nf_frag.frags);
 	return res;
 }
 
 static void nf_ct_net_exit(struct net *net)
 {
 	nf_ct_frags6_sysctl_unregister(net);
-	inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+	inet_frags_exit_net(&net->nf_frag.frags);
 }
 
 static struct pernet_operations nf_ct_net_ops = {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index a8f7a5f0251a..4855de6f673a 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -128,8 +128,7 @@ void ip6_frag_init(struct inet_frag_queue *q, const void *a)
 }
 EXPORT_SYMBOL(ip6_frag_init);
 
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
-			   struct inet_frags *frags)
+void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
 {
 	struct net_device *dev = NULL;
 
@@ -138,7 +137,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 	if (fq->q.flags & INET_FRAG_COMPLETE)
 		goto out;
 
-	inet_frag_kill(&fq->q, frags);
+	inet_frag_kill(&fq->q);
 
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(net, fq->iif);
@@ -166,7 +165,7 @@ out_rcu_unlock:
 	rcu_read_unlock();
 out:
 	spin_unlock(&fq->q.lock);
-	inet_frag_put(&fq->q, frags);
+	inet_frag_put(&fq->q);
 }
 EXPORT_SYMBOL(ip6_expire_frag_queue);
 
@@ -179,7 +178,7 @@ static void ip6_frag_expire(struct timer_list *t)
 	fq = container_of(frag, struct frag_queue, q);
 	net = container_of(fq->q.net, struct net, ipv6.frags);
 
-	ip6_expire_frag_queue(net, fq, &ip6_frags);
+	ip6_expire_frag_queue(net, fq);
 }
 
 static struct frag_queue *
@@ -364,7 +363,7 @@ found:
 	return -1;
 
 discard_fq:
-	inet_frag_kill(&fq->q, &ip6_frags);
+	inet_frag_kill(&fq->q);
 err:
 	__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 			IPSTATS_MIB_REASMFAILS);
@@ -391,7 +390,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	int sum_truesize;
 	u8 ecn;
 
-	inet_frag_kill(&fq->q, &ip6_frags);
+	inet_frag_kill(&fq->q);
 
 	ecn = ip_frag_ecn_table[fq->ecn];
 	if (unlikely(ecn == 0xff))
@@ -569,7 +568,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
 
 		spin_unlock(&fq->q.lock);
-		inet_frag_put(&fq->q, &ip6_frags);
+		inet_frag_put(&fq->q);
 		return ret;
 	}
 
@@ -716,6 +715,7 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 	net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
+	net->ipv6.frags.f = &ip6_frags;
 
 	res = inet_frags_init_net(&net->ipv6.frags);
 	if (res < 0)
@@ -723,14 +723,14 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 
 	res = ip6_frags_ns_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+		inet_frags_exit_net(&net->ipv6.frags);
 	return res;
 }
 
 static void __net_exit ipv6_frags_exit_net(struct net *net)
 {
 	ip6_frags_ns_sysctl_unregister(net);
-	inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+	inet_frags_exit_net(&net->ipv6.frags);
 }
 
 static struct pernet_operations ip6_frags_ops = {
-- 
cgit v1.2.3


From 5b975bab23615cd0fdf67af6c9298eb01c4b9f61 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:45 -0700
Subject: inet: frags: refactor ipv6_frag_init()

We want to call inet_frags_init() earlier.

This is a prereq to "inet: frags: use rhashtables for reassembly units"

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/reassembly.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 4855de6f673a..f0071b113a92 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -742,10 +742,21 @@ int __init ipv6_frag_init(void)
 {
 	int ret;
 
-	ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+	ip6_frags.hashfn = ip6_hashfn;
+	ip6_frags.constructor = ip6_frag_init;
+	ip6_frags.destructor = NULL;
+	ip6_frags.qsize = sizeof(struct frag_queue);
+	ip6_frags.match = ip6_frag_match;
+	ip6_frags.frag_expire = ip6_frag_expire;
+	ip6_frags.frags_cache_name = ip6_frag_cache_name;
+	ret = inet_frags_init(&ip6_frags);
 	if (ret)
 		goto out;
 
+	ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+	if (ret)
+		goto err_protocol;
+
 	ret = ip6_frags_sysctl_register();
 	if (ret)
 		goto err_sysctl;
@@ -754,16 +765,6 @@ int __init ipv6_frag_init(void)
 	if (ret)
 		goto err_pernet;
 
-	ip6_frags.hashfn = ip6_hashfn;
-	ip6_frags.constructor = ip6_frag_init;
-	ip6_frags.destructor = NULL;
-	ip6_frags.qsize = sizeof(struct frag_queue);
-	ip6_frags.match = ip6_frag_match;
-	ip6_frags.frag_expire = ip6_frag_expire;
-	ip6_frags.frags_cache_name = ip6_frag_cache_name;
-	ret = inet_frags_init(&ip6_frags);
-	if (ret)
-		goto err_pernet;
 out:
 	return ret;
 
@@ -771,6 +772,8 @@ err_pernet:
 	ip6_frags_sysctl_unregister();
 err_sysctl:
 	inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+err_protocol:
+	inet_frags_fini(&ip6_frags);
 	goto out;
 }
 
-- 
cgit v1.2.3


From 807f1844df4ac23594268fa9f41902d0549e92aa Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:46 -0700
Subject: inet: frags: refactor lowpan_net_frag_init()

We want to call lowpan_net_frag_init() earlier.
Similar to commit "inet: frags: refactor ipv6_frag_init()"

This is a prereq to "inet: frags: use rhashtables for reassembly units"

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/6lowpan/reassembly.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 6badc055555b..ddada12a044d 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -615,14 +615,6 @@ int __init lowpan_net_frag_init(void)
 {
 	int ret;
 
-	ret = lowpan_frags_sysctl_register();
-	if (ret)
-		return ret;
-
-	ret = register_pernet_subsys(&lowpan_frags_ops);
-	if (ret)
-		goto err_pernet;
-
 	lowpan_frags.hashfn = lowpan_hashfn;
 	lowpan_frags.constructor = lowpan_frag_init;
 	lowpan_frags.destructor = NULL;
@@ -632,11 +624,21 @@ int __init lowpan_net_frag_init(void)
 	lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
 	ret = inet_frags_init(&lowpan_frags);
 	if (ret)
-		goto err_pernet;
+		goto out;
 
+	ret = lowpan_frags_sysctl_register();
+	if (ret)
+		goto err_sysctl;
+
+	ret = register_pernet_subsys(&lowpan_frags_ops);
+	if (ret)
+		goto err_pernet;
+out:
 	return ret;
 err_pernet:
 	lowpan_frags_sysctl_unregister();
+err_sysctl:
+	inet_frags_fini(&lowpan_frags);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 483a6e4fa055123142d8956866fe2aa9c98d546d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:47 -0700
Subject: inet: frags: refactor ipfrag_init()

We need to call inet_frags_init() before register_pernet_subsys(),
as a prereq for following patch ("inet: frags: use rhashtables for reassembly units")

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_fragment.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index cd2b4c9419fc..1a3bc85d6f5e 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -896,8 +896,6 @@ static struct pernet_operations ip4_frags_ops = {
 
 void __init ipfrag_init(void)
 {
-	ip4_frags_ctl_register();
-	register_pernet_subsys(&ip4_frags_ops);
 	ip4_frags.hashfn = ip4_hashfn;
 	ip4_frags.constructor = ip4_frag_init;
 	ip4_frags.destructor = ip4_frag_free;
@@ -907,4 +905,6 @@ void __init ipfrag_init(void)
 	ip4_frags.frags_cache_name = ip_frag_cache_name;
 	if (inet_frags_init(&ip4_frags))
 		panic("IP: failed to allocate ip4_frags cache\n");
+	ip4_frags_ctl_register();
+	register_pernet_subsys(&ip4_frags_ops);
 }
-- 
cgit v1.2.3


From 648700f76b03b7e8149d13cc2bdb3355035258a9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:49 -0700
Subject: inet: frags: use rhashtables for reassembly units

Some applications still rely on IP fragmentation, and to be fair linux
reassembly unit is not working under any serious load.

It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!)

A work queue is supposed to garbage collect items when host is under memory
pressure, and doing a hash rebuild, changing seed used in hash computations.

This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
occurring every 5 seconds if host is under fire.

Then there is the problem of sharing this hash table for all netns.

It is time to switch to rhashtables, and allocate one of them per netns
to speedup netns dismantle, since this is a critical metric these days.

Lookup is now using RCU. A followup patch will even remove
the refcount hold/release left from prior implementation and save
a couple of atomic operations.

Before this patch, 16 cpus (16 RX queue NIC) could not handle more
than 1 Mpps frags DDOS.

After the patch, I reach 9 Mpps without any tuning, and can use up to 2GB
of storage for the fragments (exact number depends on frags being evicted
after timeout)

$ grep FRAG /proc/net/sockstat
FRAG: inuse 1966916 memory 2140004608

A followup patch will change the limits for 64bit arches.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Florian Westphal <fw@strlen.de>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Alexander Aring <alex.aring@gmail.com>
Cc: Stefan Schmidt <stefan@osg.samsung.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/6lowpan/6lowpan_i.h      |  26 +--
 net/ieee802154/6lowpan/reassembly.c     |  91 ++++-----
 net/ipv4/inet_fragment.c                | 344 ++++++--------------------------
 net/ipv4/ip_fragment.c                  | 112 +++++------
 net/ipv6/netfilter/nf_conntrack_reasm.c |  51 ++---
 net/ipv6/reassembly.c                   | 110 +++++-----
 6 files changed, 225 insertions(+), 509 deletions(-)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
index d8de3bcfb103..b8d95cb71c25 100644
--- a/net/ieee802154/6lowpan/6lowpan_i.h
+++ b/net/ieee802154/6lowpan/6lowpan_i.h
@@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result;
 #define LOWPAN_DISPATCH_FRAG1           0xc0
 #define LOWPAN_DISPATCH_FRAGN           0xe0
 
-struct lowpan_create_arg {
+struct frag_lowpan_compare_key {
 	u16 tag;
 	u16 d_size;
-	const struct ieee802154_addr *src;
-	const struct ieee802154_addr *dst;
+	const struct ieee802154_addr src;
+	const struct ieee802154_addr dst;
 };
 
-/* Equivalent of ipv4 struct ip
+/* Equivalent of ipv4 struct ipq
  */
 struct lowpan_frag_queue {
 	struct inet_frag_queue	q;
-
-	u16			tag;
-	u16			d_size;
-	struct ieee802154_addr	saddr;
-	struct ieee802154_addr	daddr;
 };
 
-static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
-{
-	switch (a->mode) {
-	case IEEE802154_ADDR_LONG:
-		return (((__force u64)a->extended_addr) >> 32) ^
-			(((__force u64)a->extended_addr) & 0xffffffff);
-	case IEEE802154_ADDR_SHORT:
-		return (__force u32)(a->short_addr + (a->pan_id << 16));
-	default:
-		return 0;
-	}
-}
-
 int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
 void lowpan_net_frag_exit(void);
 int lowpan_net_frag_init(void);
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index ddada12a044d..0fa0121f85d4 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags;
 static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
 			     struct sk_buff *prev, struct net_device *ldev);
 
-static unsigned int lowpan_hash_frag(u16 tag, u16 d_size,
-				     const struct ieee802154_addr *saddr,
-				     const struct ieee802154_addr *daddr)
-{
-	net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd));
-	return jhash_3words(ieee802154_addr_hash(saddr),
-			    ieee802154_addr_hash(daddr),
-			    (__force u32)(tag + (d_size << 16)),
-			    lowpan_frags.rnd);
-}
-
-static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
-{
-	const struct lowpan_frag_queue *fq;
-
-	fq = container_of(q, struct lowpan_frag_queue, q);
-	return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr);
-}
-
-static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a)
-{
-	const struct lowpan_frag_queue *fq;
-	const struct lowpan_create_arg *arg = a;
-
-	fq = container_of(q, struct lowpan_frag_queue, q);
-	return	fq->tag == arg->tag && fq->d_size == arg->d_size &&
-		ieee802154_addr_equal(&fq->saddr, arg->src) &&
-		ieee802154_addr_equal(&fq->daddr, arg->dst);
-}
-
 static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
 {
-	const struct lowpan_create_arg *arg = a;
+	const struct frag_lowpan_compare_key *key = a;
 	struct lowpan_frag_queue *fq;
 
 	fq = container_of(q, struct lowpan_frag_queue, q);
 
-	fq->tag = arg->tag;
-	fq->d_size = arg->d_size;
-	fq->saddr = *arg->src;
-	fq->daddr = *arg->dst;
+	BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
+	memcpy(&q->key, key, sizeof(*key));
 }
 
 static void lowpan_frag_expire(struct timer_list *t)
@@ -105,21 +73,17 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
 	const struct ieee802154_addr *src,
 	const struct ieee802154_addr *dst)
 {
-	struct inet_frag_queue *q;
-	struct lowpan_create_arg arg;
-	unsigned int hash;
 	struct netns_ieee802154_lowpan *ieee802154_lowpan =
 		net_ieee802154_lowpan(net);
+	struct frag_lowpan_compare_key key = {
+		.tag = cb->d_tag,
+		.d_size = cb->d_size,
+		.src = *src,
+		.dst = *dst,
+	};
+	struct inet_frag_queue *q;
 
-	arg.tag = cb->d_tag;
-	arg.d_size = cb->d_size;
-	arg.src = src;
-	arg.dst = dst;
-
-	hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst);
-
-	q = inet_frag_find(&ieee802154_lowpan->frags,
-			   &lowpan_frags, &arg, hash);
+	q = inet_frag_find(&ieee802154_lowpan->frags, &key);
 	if (IS_ERR_OR_NULL(q)) {
 		inet_frag_maybe_warn_overflow(q, pr_fmt());
 		return NULL;
@@ -611,17 +575,46 @@ static struct pernet_operations lowpan_frags_ops = {
 	.exit = lowpan_frags_exit_net,
 };
 
+static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
+{
+	return jhash2(data,
+		      sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
+}
+
+static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct inet_frag_queue *fq = data;
+
+	return jhash2((const u32 *)&fq->key,
+		      sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
+}
+
+static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
+{
+	const struct frag_lowpan_compare_key *key = arg->key;
+	const struct inet_frag_queue *fq = ptr;
+
+	return !!memcmp(&fq->key, key, sizeof(*key));
+}
+
+static const struct rhashtable_params lowpan_rhash_params = {
+	.head_offset		= offsetof(struct inet_frag_queue, node),
+	.hashfn			= lowpan_key_hashfn,
+	.obj_hashfn		= lowpan_obj_hashfn,
+	.obj_cmpfn		= lowpan_obj_cmpfn,
+	.automatic_shrinking	= true,
+};
+
 int __init lowpan_net_frag_init(void)
 {
 	int ret;
 
-	lowpan_frags.hashfn = lowpan_hashfn;
 	lowpan_frags.constructor = lowpan_frag_init;
 	lowpan_frags.destructor = NULL;
 	lowpan_frags.qsize = sizeof(struct frag_queue);
-	lowpan_frags.match = lowpan_frag_match;
 	lowpan_frags.frag_expire = lowpan_frag_expire;
 	lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
+	lowpan_frags.rhash_params = lowpan_rhash_params;
 	ret = inet_frags_init(&lowpan_frags);
 	if (ret)
 		goto out;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 1ac69f65d0de..ebb8f411e0db 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -25,12 +25,6 @@
 #include <net/inet_frag.h>
 #include <net/inet_ecn.h>
 
-#define INETFRAGS_EVICT_BUCKETS   128
-#define INETFRAGS_EVICT_MAX	  512
-
-/* don't rebuild inetfrag table with new secret more often than this */
-#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
-
 /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
  * Value : 0xff if frame should be dropped.
  *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
@@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = {
 };
 EXPORT_SYMBOL(ip_frag_ecn_table);
 
-static unsigned int
-inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
-{
-	return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
-}
-
-static bool inet_frag_may_rebuild(struct inet_frags *f)
-{
-	return time_after(jiffies,
-	       f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
-}
-
-static void inet_frag_secret_rebuild(struct inet_frags *f)
-{
-	int i;
-
-	write_seqlock_bh(&f->rnd_seqlock);
-
-	if (!inet_frag_may_rebuild(f))
-		goto out;
-
-	get_random_bytes(&f->rnd, sizeof(u32));
-
-	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
-		struct inet_frag_bucket *hb;
-		struct inet_frag_queue *q;
-		struct hlist_node *n;
-
-		hb = &f->hash[i];
-		spin_lock(&hb->chain_lock);
-
-		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
-			unsigned int hval = inet_frag_hashfn(f, q);
-
-			if (hval != i) {
-				struct inet_frag_bucket *hb_dest;
-
-				hlist_del(&q->list);
-
-				/* Relink to new hash chain. */
-				hb_dest = &f->hash[hval];
-
-				/* This is the only place where we take
-				 * another chain_lock while already holding
-				 * one.  As this will not run concurrently,
-				 * we cannot deadlock on hb_dest lock below, if its
-				 * already locked it will be released soon since
-				 * other caller cannot be waiting for hb lock
-				 * that we've taken above.
-				 */
-				spin_lock_nested(&hb_dest->chain_lock,
-						 SINGLE_DEPTH_NESTING);
-				hlist_add_head(&q->list, &hb_dest->chain);
-				spin_unlock(&hb_dest->chain_lock);
-			}
-		}
-		spin_unlock(&hb->chain_lock);
-	}
-
-	f->rebuild = false;
-	f->last_rebuild_jiffies = jiffies;
-out:
-	write_sequnlock_bh(&f->rnd_seqlock);
-}
-
-static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
-{
-	if (!hlist_unhashed(&q->list_evictor))
-		return false;
-
-	return q->net->low_thresh == 0 ||
-	       frag_mem_limit(q->net) >= q->net->low_thresh;
-}
-
-static unsigned int
-inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
-{
-	struct inet_frag_queue *fq;
-	struct hlist_node *n;
-	unsigned int evicted = 0;
-	HLIST_HEAD(expired);
-
-	spin_lock(&hb->chain_lock);
-
-	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
-		if (!inet_fragq_should_evict(fq))
-			continue;
-
-		if (!del_timer(&fq->timer))
-			continue;
-
-		hlist_add_head(&fq->list_evictor, &expired);
-		++evicted;
-	}
-
-	spin_unlock(&hb->chain_lock);
-
-	hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
-		f->frag_expire(&fq->timer);
-
-	return evicted;
-}
-
-static void inet_frag_worker(struct work_struct *work)
-{
-	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
-	unsigned int i, evicted = 0;
-	struct inet_frags *f;
-
-	f = container_of(work, struct inet_frags, frags_work);
-
-	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
-
-	local_bh_disable();
-
-	for (i = READ_ONCE(f->next_bucket); budget; --budget) {
-		evicted += inet_evict_bucket(f, &f->hash[i]);
-		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
-		if (evicted > INETFRAGS_EVICT_MAX)
-			break;
-	}
-
-	f->next_bucket = i;
-
-	local_bh_enable();
-
-	if (f->rebuild && inet_frag_may_rebuild(f))
-		inet_frag_secret_rebuild(f);
-}
-
-static void inet_frag_schedule_worker(struct inet_frags *f)
-{
-	if (unlikely(!work_pending(&f->frags_work)))
-		schedule_work(&f->frags_work);
-}
-
 int inet_frags_init(struct inet_frags *f)
 {
-	int i;
-
-	INIT_WORK(&f->frags_work, inet_frag_worker);
-
-	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
-		struct inet_frag_bucket *hb = &f->hash[i];
-
-		spin_lock_init(&hb->chain_lock);
-		INIT_HLIST_HEAD(&hb->chain);
-	}
-
-	seqlock_init(&f->rnd_seqlock);
-	f->last_rebuild_jiffies = 0;
 	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
 					    NULL);
 	if (!f->frags_cachep)
@@ -214,66 +59,42 @@ EXPORT_SYMBOL(inet_frags_init);
 
 void inet_frags_fini(struct inet_frags *f)
 {
-	cancel_work_sync(&f->frags_work);
+	/* We must wait that all inet_frag_destroy_rcu() have completed. */
+	rcu_barrier();
+
 	kmem_cache_destroy(f->frags_cachep);
+	f->frags_cachep = NULL;
 }
 EXPORT_SYMBOL(inet_frags_fini);
 
-void inet_frags_exit_net(struct netns_frags *nf)
-{
-	struct inet_frags *f =nf->f;
-	unsigned int seq;
-	int i;
-
-	nf->low_thresh = 0;
-
-evict_again:
-	local_bh_disable();
-	seq = read_seqbegin(&f->rnd_seqlock);
-
-	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
-		inet_evict_bucket(f, &f->hash[i]);
-
-	local_bh_enable();
-	cond_resched();
-
-	if (read_seqretry(&f->rnd_seqlock, seq) ||
-	    sum_frag_mem_limit(nf))
-		goto evict_again;
-}
-EXPORT_SYMBOL(inet_frags_exit_net);
-
-static struct inet_frag_bucket *
-get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
-__acquires(hb->chain_lock)
+static void inet_frags_free_cb(void *ptr, void *arg)
 {
-	struct inet_frag_bucket *hb;
-	unsigned int seq, hash;
+	struct inet_frag_queue *fq = ptr;
 
- restart:
-	seq = read_seqbegin(&f->rnd_seqlock);
-
-	hash = inet_frag_hashfn(f, fq);
-	hb = &f->hash[hash];
+	/* If we can not cancel the timer, it means this frag_queue
+	 * is already disappearing, we have nothing to do.
+	 * Otherwise, we own a refcount until the end of this function.
+	 */
+	if (!del_timer(&fq->timer))
+		return;
 
-	spin_lock(&hb->chain_lock);
-	if (read_seqretry(&f->rnd_seqlock, seq)) {
-		spin_unlock(&hb->chain_lock);
-		goto restart;
+	spin_lock_bh(&fq->lock);
+	if (!(fq->flags & INET_FRAG_COMPLETE)) {
+		fq->flags |= INET_FRAG_COMPLETE;
+		refcount_dec(&fq->refcnt);
 	}
+	spin_unlock_bh(&fq->lock);
 
-	return hb;
+	inet_frag_put(fq);
 }
 
-static inline void fq_unlink(struct inet_frag_queue *fq)
+void inet_frags_exit_net(struct netns_frags *nf)
 {
-	struct inet_frag_bucket *hb;
+	nf->low_thresh = 0; /* prevent creation of new frags */
 
-	hb = get_frag_bucket_locked(fq, fq->net->f);
-	hlist_del(&fq->list);
-	fq->flags |= INET_FRAG_COMPLETE;
-	spin_unlock(&hb->chain_lock);
+	rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
 }
+EXPORT_SYMBOL(inet_frags_exit_net);
 
 void inet_frag_kill(struct inet_frag_queue *fq)
 {
@@ -281,12 +102,26 @@ void inet_frag_kill(struct inet_frag_queue *fq)
 		refcount_dec(&fq->refcnt);
 
 	if (!(fq->flags & INET_FRAG_COMPLETE)) {
-		fq_unlink(fq);
+		struct netns_frags *nf = fq->net;
+
+		fq->flags |= INET_FRAG_COMPLETE;
+		rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
 		refcount_dec(&fq->refcnt);
 	}
 }
 EXPORT_SYMBOL(inet_frag_kill);
 
+static void inet_frag_destroy_rcu(struct rcu_head *head)
+{
+	struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
+						 rcu);
+	struct inet_frags *f = q->net->f;
+
+	if (f->destructor)
+		f->destructor(q);
+	kmem_cache_free(f->frags_cachep, q);
+}
+
 void inet_frag_destroy(struct inet_frag_queue *q)
 {
 	struct sk_buff *fp;
@@ -310,59 +145,20 @@ void inet_frag_destroy(struct inet_frag_queue *q)
 	}
 	sum = sum_truesize + f->qsize;
 
-	if (f->destructor)
-		f->destructor(q);
-	kmem_cache_free(f->frags_cachep, q);
+	call_rcu(&q->rcu, inet_frag_destroy_rcu);
 
 	sub_frag_mem_limit(nf, sum);
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
-						struct inet_frag_queue *qp_in,
-						struct inet_frags *f,
-						void *arg)
-{
-	struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
-	struct inet_frag_queue *qp;
-
-#ifdef CONFIG_SMP
-	/* With SMP race we have to recheck hash table, because
-	 * such entry could have been created on other cpu before
-	 * we acquired hash bucket lock.
-	 */
-	hlist_for_each_entry(qp, &hb->chain, list) {
-		if (qp->net == nf && f->match(qp, arg)) {
-			refcount_inc(&qp->refcnt);
-			spin_unlock(&hb->chain_lock);
-			qp_in->flags |= INET_FRAG_COMPLETE;
-			inet_frag_put(qp_in);
-			return qp;
-		}
-	}
-#endif
-	qp = qp_in;
-	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
-		refcount_inc(&qp->refcnt);
-
-	refcount_inc(&qp->refcnt);
-	hlist_add_head(&qp->list, &hb->chain);
-
-	spin_unlock(&hb->chain_lock);
-
-	return qp;
-}
-
 static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 					       struct inet_frags *f,
 					       void *arg)
 {
 	struct inet_frag_queue *q;
 
-	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
-		inet_frag_schedule_worker(f);
+	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
 		return NULL;
-	}
 
 	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
 	if (!q)
@@ -374,59 +170,52 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 
 	timer_setup(&q->timer, f->frag_expire, 0);
 	spin_lock_init(&q->lock);
-	refcount_set(&q->refcnt, 1);
+	refcount_set(&q->refcnt, 3);
 
 	return q;
 }
 
 static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
-						struct inet_frags *f,
 						void *arg)
 {
+	struct inet_frags *f = nf->f;
 	struct inet_frag_queue *q;
+	int err;
 
 	q = inet_frag_alloc(nf, f, arg);
 	if (!q)
 		return NULL;
 
-	return inet_frag_intern(nf, q, f, arg);
+	mod_timer(&q->timer, jiffies + nf->timeout);
+
+	err = rhashtable_insert_fast(&nf->rhashtable, &q->node,
+				     f->rhash_params);
+	if (err < 0) {
+		q->flags |= INET_FRAG_COMPLETE;
+		inet_frag_kill(q);
+		inet_frag_destroy(q);
+		return NULL;
+	}
+	return q;
 }
 
-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
-				       struct inet_frags *f, void *key,
-				       unsigned int hash)
+/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
+struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
 {
-	struct inet_frag_bucket *hb;
-	struct inet_frag_queue *q;
-	int depth = 0;
-
-	if (frag_mem_limit(nf) > nf->low_thresh)
-		inet_frag_schedule_worker(f);
-
-	hash &= (INETFRAGS_HASHSZ - 1);
-	hb = &f->hash[hash];
-
-	spin_lock(&hb->chain_lock);
-	hlist_for_each_entry(q, &hb->chain, list) {
-		if (q->net == nf && f->match(q, key)) {
-			refcount_inc(&q->refcnt);
-			spin_unlock(&hb->chain_lock);
-			return q;
-		}
-		depth++;
-	}
-	spin_unlock(&hb->chain_lock);
+	struct inet_frag_queue *fq;
 
-	if (depth <= INETFRAGS_MAXDEPTH)
-		return inet_frag_create(nf, f, key);
+	rcu_read_lock();
 
-	if (inet_frag_may_rebuild(f)) {
-		if (!f->rebuild)
-			f->rebuild = true;
-		inet_frag_schedule_worker(f);
+	fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
+	if (fq) {
+		if (!refcount_inc_not_zero(&fq->refcnt))
+			fq = NULL;
+		rcu_read_unlock();
+		return fq;
 	}
+	rcu_read_unlock();
 
-	return ERR_PTR(-ENOBUFS);
+	return inet_frag_create(nf, key);
 }
 EXPORT_SYMBOL(inet_frag_find);
 
@@ -434,8 +223,7 @@ void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
 				   const char *prefix)
 {
 	static const char msg[] = "inet_frag_find: Fragment hash bucket"
-		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
-		". Dropping fragment.\n";
+		" list length grew over limit. Dropping fragment.\n";
 
 	if (PTR_ERR(q) == -ENOBUFS)
 		net_dbg_ratelimited("%s%s", prefix, msg);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1a3bc85d6f5e..4021820db6f2 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -69,15 +69,9 @@ struct ipfrag_skb_cb
 struct ipq {
 	struct inet_frag_queue q;
 
-	u32		user;
-	__be32		saddr;
-	__be32		daddr;
-	__be16		id;
-	u8		protocol;
 	u8		ecn; /* RFC3168 support */
 	u16		max_df_size; /* largest frag with DF set seen */
 	int             iif;
-	int             vif;   /* L3 master device index */
 	unsigned int    rid;
 	struct inet_peer *peer;
 };
@@ -97,41 +91,6 @@ int ip_frag_mem(struct net *net)
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 			 struct net_device *dev);
 
-struct ip4_create_arg {
-	struct iphdr *iph;
-	u32 user;
-	int vif;
-};
-
-static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
-{
-	net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
-	return jhash_3words((__force u32)id << 16 | prot,
-			    (__force u32)saddr, (__force u32)daddr,
-			    ip4_frags.rnd);
-}
-
-static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
-{
-	const struct ipq *ipq;
-
-	ipq = container_of(q, struct ipq, q);
-	return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
-}
-
-static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
-{
-	const struct ipq *qp;
-	const struct ip4_create_arg *arg = a;
-
-	qp = container_of(q, struct ipq, q);
-	return	qp->id == arg->iph->id &&
-		qp->saddr == arg->iph->saddr &&
-		qp->daddr == arg->iph->daddr &&
-		qp->protocol == arg->iph->protocol &&
-		qp->user == arg->user &&
-		qp->vif == arg->vif;
-}
 
 static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
 {
@@ -140,17 +99,12 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
 					       frags);
 	struct net *net = container_of(ipv4, struct net, ipv4);
 
-	const struct ip4_create_arg *arg = a;
+	const struct frag_v4_compare_key *key = a;
 
-	qp->protocol = arg->iph->protocol;
-	qp->id = arg->iph->id;
-	qp->ecn = ip4_frag_ecn(arg->iph->tos);
-	qp->saddr = arg->iph->saddr;
-	qp->daddr = arg->iph->daddr;
-	qp->vif = arg->vif;
-	qp->user = arg->user;
+	q->key.v4 = *key;
+	qp->ecn = 0;
 	qp->peer = q->net->max_dist ?
-		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
+		inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
 		NULL;
 }
 
@@ -234,7 +188,7 @@ static void ip_expire(struct timer_list *t)
 		/* Only an end host needs to send an ICMP
 		 * "Fragment Reassembly Timeout" message, per RFC792.
 		 */
-		if (frag_expire_skip_icmp(qp->user) &&
+		if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
 		    (skb_rtable(head)->rt_type != RTN_LOCAL))
 			goto out;
 
@@ -262,17 +216,17 @@ out_rcu_unlock:
 static struct ipq *ip_find(struct net *net, struct iphdr *iph,
 			   u32 user, int vif)
 {
+	struct frag_v4_compare_key key = {
+		.saddr = iph->saddr,
+		.daddr = iph->daddr,
+		.user = user,
+		.vif = vif,
+		.id = iph->id,
+		.protocol = iph->protocol,
+	};
 	struct inet_frag_queue *q;
-	struct ip4_create_arg arg;
-	unsigned int hash;
-
-	arg.iph = iph;
-	arg.user = user;
-	arg.vif = vif;
 
-	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
-
-	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
+	q = inet_frag_find(&net->ipv4.frags, &key);
 	if (IS_ERR_OR_NULL(q)) {
 		inet_frag_maybe_warn_overflow(q, pr_fmt());
 		return NULL;
@@ -656,7 +610,7 @@ out_nomem:
 	err = -ENOMEM;
 	goto out_fail;
 out_oversize:
-	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
+	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
 out_fail:
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 	return err;
@@ -894,15 +848,47 @@ static struct pernet_operations ip4_frags_ops = {
 	.exit = ipv4_frags_exit_net,
 };
 
+
+static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
+{
+	return jhash2(data,
+		      sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
+}
+
+static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct inet_frag_queue *fq = data;
+
+	return jhash2((const u32 *)&fq->key.v4,
+		      sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
+}
+
+static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
+{
+	const struct frag_v4_compare_key *key = arg->key;
+	const struct inet_frag_queue *fq = ptr;
+
+	return !!memcmp(&fq->key, key, sizeof(*key));
+}
+
+static const struct rhashtable_params ip4_rhash_params = {
+	.head_offset		= offsetof(struct inet_frag_queue, node),
+	.key_offset		= offsetof(struct inet_frag_queue, key),
+	.key_len		= sizeof(struct frag_v4_compare_key),
+	.hashfn			= ip4_key_hashfn,
+	.obj_hashfn		= ip4_obj_hashfn,
+	.obj_cmpfn		= ip4_obj_cmpfn,
+	.automatic_shrinking	= true,
+};
+
 void __init ipfrag_init(void)
 {
-	ip4_frags.hashfn = ip4_hashfn;
 	ip4_frags.constructor = ip4_frag_init;
 	ip4_frags.destructor = ip4_frag_free;
 	ip4_frags.qsize = sizeof(struct ipq);
-	ip4_frags.match = ip4_frag_match;
 	ip4_frags.frag_expire = ip_expire;
 	ip4_frags.frags_cache_name = ip_frag_cache_name;
+	ip4_frags.rhash_params = ip4_rhash_params;
 	if (inet_frags_init(&ip4_frags))
 		panic("IP: failed to allocate ip4_frags cache\n");
 	ip4_frags_ctl_register();
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index c4b40fdee838..0ad3df551d98 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -152,23 +152,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
 	return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
 }
 
-static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
-				 const struct in6_addr *daddr)
-{
-	net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd));
-	return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
-			    (__force u32)id, nf_frags.rnd);
-}
-
-
-static unsigned int nf_hashfn(const struct inet_frag_queue *q)
-{
-	const struct frag_queue *nq;
-
-	nq = container_of(q, struct frag_queue, q);
-	return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr);
-}
-
 static void nf_ct_frag6_expire(struct timer_list *t)
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
@@ -182,26 +165,19 @@ static void nf_ct_frag6_expire(struct timer_list *t)
 }
 
 /* Creation primitives. */
-static inline struct frag_queue *fq_find(struct net *net, __be32 id,
-					 u32 user, struct in6_addr *src,
-					 struct in6_addr *dst, int iif, u8 ecn)
+static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
+				  const struct ipv6hdr *hdr, int iif)
 {
+	struct frag_v6_compare_key key = {
+		.id = id,
+		.saddr = hdr->saddr,
+		.daddr = hdr->daddr,
+		.user = user,
+		.iif = iif,
+	};
 	struct inet_frag_queue *q;
-	struct ip6_create_arg arg;
-	unsigned int hash;
-
-	arg.id = id;
-	arg.user = user;
-	arg.src = src;
-	arg.dst = dst;
-	arg.iif = iif;
-	arg.ecn = ecn;
-
-	local_bh_disable();
-	hash = nf_hash_frag(id, src, dst);
 
-	q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
-	local_bh_enable();
+	q = inet_frag_find(&net->nf_frag.frags, &key);
 	if (IS_ERR_OR_NULL(q)) {
 		inet_frag_maybe_warn_overflow(q, pr_fmt());
 		return NULL;
@@ -593,8 +569,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	fhdr = (struct frag_hdr *)skb_transport_header(skb);
 
 	skb_orphan(skb);
-	fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
-		     skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
+	fq = fq_find(net, fhdr->identification, user, hdr,
+		     skb->dev ? skb->dev->ifindex : 0);
 	if (fq == NULL) {
 		pr_debug("Can't find and can't create new queue\n");
 		return -ENOMEM;
@@ -660,13 +636,12 @@ int nf_ct_frag6_init(void)
 {
 	int ret = 0;
 
-	nf_frags.hashfn = nf_hashfn;
 	nf_frags.constructor = ip6_frag_init;
 	nf_frags.destructor = NULL;
 	nf_frags.qsize = sizeof(struct frag_queue);
-	nf_frags.match = ip6_frag_match;
 	nf_frags.frag_expire = nf_ct_frag6_expire;
 	nf_frags.frags_cache_name = nf_frags_cache_name;
+	nf_frags.rhash_params = ip6_rhash_params;
 	ret = inet_frags_init(&nf_frags);
 	if (ret)
 		goto out;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index f0071b113a92..3fc853e4492a 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -79,52 +79,13 @@ static struct inet_frags ip6_frags;
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 			  struct net_device *dev);
 
-/*
- * callers should be careful not to use the hash value outside the ipfrag_lock
- * as doing so could race with ipfrag_hash_rnd being recalculated.
- */
-static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
-				    const struct in6_addr *daddr)
-{
-	net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd));
-	return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
-			    (__force u32)id, ip6_frags.rnd);
-}
-
-static unsigned int ip6_hashfn(const struct inet_frag_queue *q)
-{
-	const struct frag_queue *fq;
-
-	fq = container_of(q, struct frag_queue, q);
-	return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr);
-}
-
-bool ip6_frag_match(const struct inet_frag_queue *q, const void *a)
-{
-	const struct frag_queue *fq;
-	const struct ip6_create_arg *arg = a;
-
-	fq = container_of(q, struct frag_queue, q);
-	return	fq->id == arg->id &&
-		fq->user == arg->user &&
-		ipv6_addr_equal(&fq->saddr, arg->src) &&
-		ipv6_addr_equal(&fq->daddr, arg->dst) &&
-		(arg->iif == fq->iif ||
-		 !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST |
-					       IPV6_ADDR_LINKLOCAL)));
-}
-EXPORT_SYMBOL(ip6_frag_match);
-
 void ip6_frag_init(struct inet_frag_queue *q, const void *a)
 {
 	struct frag_queue *fq = container_of(q, struct frag_queue, q);
-	const struct ip6_create_arg *arg = a;
+	const struct frag_v6_compare_key *key = a;
 
-	fq->id = arg->id;
-	fq->user = arg->user;
-	fq->saddr = *arg->src;
-	fq->daddr = *arg->dst;
-	fq->ecn = arg->ecn;
+	q->key.v6 = *key;
+	fq->ecn = 0;
 }
 EXPORT_SYMBOL(ip6_frag_init);
 
@@ -182,23 +143,22 @@ static void ip6_frag_expire(struct timer_list *t)
 }
 
 static struct frag_queue *
-fq_find(struct net *net, __be32 id, const struct in6_addr *src,
-	const struct in6_addr *dst, int iif, u8 ecn)
+fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
 {
+	struct frag_v6_compare_key key = {
+		.id = id,
+		.saddr = hdr->saddr,
+		.daddr = hdr->daddr,
+		.user = IP6_DEFRAG_LOCAL_DELIVER,
+		.iif = iif,
+	};
 	struct inet_frag_queue *q;
-	struct ip6_create_arg arg;
-	unsigned int hash;
-
-	arg.id = id;
-	arg.user = IP6_DEFRAG_LOCAL_DELIVER;
-	arg.src = src;
-	arg.dst = dst;
-	arg.iif = iif;
-	arg.ecn = ecn;
 
-	hash = inet6_hash_frag(id, src, dst);
+	if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST |
+					    IPV6_ADDR_LINKLOCAL)))
+		key.iif = 0;
 
-	q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash);
+	q = inet_frag_find(&net->ipv6.frags, &key);
 	if (IS_ERR_OR_NULL(q)) {
 		inet_frag_maybe_warn_overflow(q, pr_fmt());
 		return NULL;
@@ -530,6 +490,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 	struct frag_queue *fq;
 	const struct ipv6hdr *hdr = ipv6_hdr(skb);
 	struct net *net = dev_net(skb_dst(skb)->dev);
+	int iif;
 
 	if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
 		goto fail_hdr;
@@ -558,13 +519,14 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 		return 1;
 	}
 
-	fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
-		     skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
+	iif = skb->dev ? skb->dev->ifindex : 0;
+	fq = fq_find(net, fhdr->identification, hdr, iif);
 	if (fq) {
 		int ret;
 
 		spin_lock(&fq->q.lock);
 
+		fq->iif = iif;
 		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
 
 		spin_unlock(&fq->q.lock);
@@ -738,17 +700,47 @@ static struct pernet_operations ip6_frags_ops = {
 	.exit = ipv6_frags_exit_net,
 };
 
+static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed)
+{
+	return jhash2(data,
+		      sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
+}
+
+static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct inet_frag_queue *fq = data;
+
+	return jhash2((const u32 *)&fq->key.v6,
+		      sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
+}
+
+static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
+{
+	const struct frag_v6_compare_key *key = arg->key;
+	const struct inet_frag_queue *fq = ptr;
+
+	return !!memcmp(&fq->key, key, sizeof(*key));
+}
+
+const struct rhashtable_params ip6_rhash_params = {
+	.head_offset		= offsetof(struct inet_frag_queue, node),
+	.hashfn			= ip6_key_hashfn,
+	.obj_hashfn		= ip6_obj_hashfn,
+	.obj_cmpfn		= ip6_obj_cmpfn,
+	.automatic_shrinking	= true,
+};
+EXPORT_SYMBOL(ip6_rhash_params);
+
 int __init ipv6_frag_init(void)
 {
 	int ret;
 
-	ip6_frags.hashfn = ip6_hashfn;
 	ip6_frags.constructor = ip6_frag_init;
 	ip6_frags.destructor = NULL;
 	ip6_frags.qsize = sizeof(struct frag_queue);
-	ip6_frags.match = ip6_frag_match;
 	ip6_frags.frag_expire = ip6_frag_expire;
 	ip6_frags.frags_cache_name = ip6_frag_cache_name;
+	ip6_frags.rhash_params = ip6_rhash_params;
 	ret = inet_frags_init(&ip6_frags);
 	if (ret)
 		goto out;
-- 
cgit v1.2.3


From 6befe4a78b1553edb6eed3a78b4bcd9748526672 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:50 -0700
Subject: inet: frags: remove some helpers

Remove sum_frag_mem_limit(), ip_frag_mem() & ip6_frag_mem()

Also since we use rhashtable we can bring back the number of fragments
in "grep FRAG /proc/net/sockstat /proc/net/sockstat6" that was
removed in commit 434d305405ab ("inet: frag: don't account number
of fragment queues")

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_fragment.c | 5 -----
 net/ipv4/proc.c        | 6 +++---
 net/ipv6/proc.c        | 5 +++--
 3 files changed, 6 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 4021820db6f2..44f4fa306e22 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -83,11 +83,6 @@ static u8 ip4_frag_ecn(u8 tos)
 
 static struct inet_frags ip4_frags;
 
-int ip_frag_mem(struct net *net)
-{
-	return sum_frag_mem_limit(&net->ipv4.frags);
-}
-
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 			 struct net_device *dev);
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index adfb75340275..aacfce0d7d82 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -54,7 +54,6 @@
 static int sockstat_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq->private;
-	unsigned int frag_mem;
 	int orphans, sockets;
 
 	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
@@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 		   sock_prot_inuse_get(net, &udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n",
 		   sock_prot_inuse_get(net, &raw_prot));
-	frag_mem = ip_frag_mem(net);
-	seq_printf(seq,  "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
+	seq_printf(seq,  "FRAG: inuse %u memory %u\n",
+		   atomic_read(&net->ipv4.frags.rhashtable.nelems),
+		   frag_mem_limit(&net->ipv4.frags));
 	return 0;
 }
 
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 6e57028d2e91..8befeb91e071 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -38,7 +38,6 @@
 static int sockstat6_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq->private;
-	unsigned int frag_mem = ip6_frag_mem(net);
 
 	seq_printf(seq, "TCP6: inuse %d\n",
 		       sock_prot_inuse_get(net, &tcpv6_prot));
@@ -48,7 +47,9 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
 			sock_prot_inuse_get(net, &udplitev6_prot));
 	seq_printf(seq, "RAW6: inuse %d\n",
 		       sock_prot_inuse_get(net, &rawv6_prot));
-	seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem);
+	seq_printf(seq, "FRAG6: inuse %u memory %u\n",
+		   atomic_read(&net->ipv6.frags.rhashtable.nelems),
+		   frag_mem_limit(&net->ipv6.frags));
 	return 0;
 }
 
-- 
cgit v1.2.3


From 399d1404be660d355192ff4df5ccc3f4159ec1e4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:51 -0700
Subject: inet: frags: get rif of inet_frag_evicting()

This refactors ip_expire() since one indentation level is removed.

Note: in the future, we should try hard to avoid the skb_clone()
since this is a serious performance cost.
Under DDOS, the ICMP message wont be sent because of rate limits.

Fact that ip6_expire_frag_queue() does not use skb_clone() is
disturbing too. Presumably IPv6 should have the same
issue than the one we fixed in commit ec4fbd64751d
("inet: frag: release spinlock before calling icmp_send()")

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_fragment.c | 65 +++++++++++++++++++++++++-------------------------
 net/ipv6/reassembly.c  |  4 ----
 2 files changed, 32 insertions(+), 37 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 44f4fa306e22..b844f517b75b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -143,8 +143,11 @@ static bool frag_expire_skip_icmp(u32 user)
 static void ip_expire(struct timer_list *t)
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
-	struct ipq *qp;
+	struct sk_buff *clone, *head;
+	const struct iphdr *iph;
 	struct net *net;
+	struct ipq *qp;
+	int err;
 
 	qp = container_of(frag, struct ipq, q);
 	net = container_of(qp->q.net, struct net, ipv4.frags);
@@ -158,45 +161,41 @@ static void ip_expire(struct timer_list *t)
 	ipq_kill(qp);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 
-	if (!inet_frag_evicting(&qp->q)) {
-		struct sk_buff *clone, *head = qp->q.fragments;
-		const struct iphdr *iph;
-		int err;
+	head = qp->q.fragments;
 
-		__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
+	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
-		if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
-			goto out;
+	if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+		goto out;
 
-		head->dev = dev_get_by_index_rcu(net, qp->iif);
-		if (!head->dev)
-			goto out;
+	head->dev = dev_get_by_index_rcu(net, qp->iif);
+	if (!head->dev)
+		goto out;
 
 
-		/* skb has no dst, perform route lookup again */
-		iph = ip_hdr(head);
-		err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+	/* skb has no dst, perform route lookup again */
+	iph = ip_hdr(head);
+	err = ip_route_input_noref(head, iph->daddr, iph->saddr,
 					   iph->tos, head->dev);
-		if (err)
-			goto out;
+	if (err)
+		goto out;
 
-		/* Only an end host needs to send an ICMP
-		 * "Fragment Reassembly Timeout" message, per RFC792.
-		 */
-		if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
-		    (skb_rtable(head)->rt_type != RTN_LOCAL))
-			goto out;
-
-		clone = skb_clone(head, GFP_ATOMIC);
-
-		/* Send an ICMP "Fragment Reassembly Timeout" message. */
-		if (clone) {
-			spin_unlock(&qp->q.lock);
-			icmp_send(clone, ICMP_TIME_EXCEEDED,
-				  ICMP_EXC_FRAGTIME, 0);
-			consume_skb(clone);
-			goto out_rcu_unlock;
-		}
+	/* Only an end host needs to send an ICMP
+	 * "Fragment Reassembly Timeout" message, per RFC792.
+	 */
+	if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
+	    (skb_rtable(head)->rt_type != RTN_LOCAL))
+		goto out;
+
+	clone = skb_clone(head, GFP_ATOMIC);
+
+	/* Send an ICMP "Fragment Reassembly Timeout" message. */
+	if (clone) {
+		spin_unlock(&qp->q.lock);
+		icmp_send(clone, ICMP_TIME_EXCEEDED,
+			  ICMP_EXC_FRAGTIME, 0);
+		consume_skb(clone);
+		goto out_rcu_unlock;
 	}
 out:
 	spin_unlock(&qp->q.lock);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 3fc853e4492a..70acad126d04 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -106,10 +106,6 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
 		goto out_rcu_unlock;
 
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
-
-	if (inet_frag_evicting(&fq->q))
-		goto out_rcu_unlock;
-
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
 
 	/* Don't send error if the first segment did not arrive. */
-- 
cgit v1.2.3


From 2d44ed22e607f9a285b049de2263e3840673a260 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:52 -0700
Subject: inet: frags: remove inet_frag_maybe_warn_overflow()

This function is obsolete, after rhashtable addition to inet defrag.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/6lowpan/reassembly.c     |  5 ++---
 net/ipv4/inet_fragment.c                | 11 -----------
 net/ipv4/ip_fragment.c                  |  5 ++---
 net/ipv6/netfilter/nf_conntrack_reasm.c |  5 ++---
 net/ipv6/reassembly.c                   |  5 ++---
 5 files changed, 8 insertions(+), 23 deletions(-)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 0fa0121f85d4..1aec71a3f904 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -84,10 +84,9 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
 	struct inet_frag_queue *q;
 
 	q = inet_frag_find(&ieee802154_lowpan->frags, &key);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct lowpan_frag_queue, q);
 }
 
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index ebb8f411e0db..c9e35b81d093 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -218,14 +218,3 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
 	return inet_frag_create(nf, key);
 }
 EXPORT_SYMBOL(inet_frag_find);
-
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
-				   const char *prefix)
-{
-	static const char msg[] = "inet_frag_find: Fragment hash bucket"
-		" list length grew over limit. Dropping fragment.\n";
-
-	if (PTR_ERR(q) == -ENOBUFS)
-		net_dbg_ratelimited("%s%s", prefix, msg);
-}
-EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b844f517b75b..b0366224f314 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -221,10 +221,9 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
 	struct inet_frag_queue *q;
 
 	q = inet_frag_find(&net->ipv4.frags, &key);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct ipq, q);
 }
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 0ad3df551d98..d866412b8f6c 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -178,10 +178,9 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
 	struct inet_frag_queue *q;
 
 	q = inet_frag_find(&net->nf_frag.frags, &key);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct frag_queue, q);
 }
 
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 70acad126d04..2a77fda5e3bc 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -155,10 +155,9 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
 		key.iif = 0;
 
 	q = inet_frag_find(&net->ipv6.frags, &key);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct frag_queue, q);
 }
 
-- 
cgit v1.2.3


From 3e67f106f619dcfaf6f4e2039599bdb69848c714 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:53 -0700
Subject: inet: frags: break the 2GB limit for frags storage

Some users are willing to provision huge amounts of memory to be able
to perform reassembly reasonnably well under pressure.

Current memory tracking is using one atomic_t and integers.

Switch to atomic_long_t so that 64bit arches can use more than 2GB,
without any cost for 32bit arches.

Note that this patch avoids an overflow error, if high_thresh was set
to ~2GB, since this test in inet_frag_alloc() was never true :

if (... || frag_mem_limit(nf) > nf->high_thresh)

Tested:

$ echo 16000000000 >/proc/sys/net/ipv4/ipfrag_high_thresh

<frag DDOS>

$ grep FRAG /proc/net/sockstat
FRAG: inuse 14705885 memory 16000002880

$ nstat -n ; sleep 1 ; nstat | grep Reas
IpReasmReqds                    3317150            0.0
IpReasmFails                    3317112            0.0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ieee802154/6lowpan/reassembly.c     | 10 +++++-----
 net/ipv4/ip_fragment.c                  | 10 +++++-----
 net/ipv4/proc.c                         |  2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +++++-----
 net/ipv6/proc.c                         |  2 +-
 net/ipv6/reassembly.c                   |  6 +++---
 6 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'net')

diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 1aec71a3f904..44f148a6bb57 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -411,23 +411,23 @@ err:
 }
 
 #ifdef CONFIG_SYSCTL
-static int zero;
+static long zero;
 
 static struct ctl_table lowpan_frags_ns_ctl_table[] = {
 	{
 		.procname	= "6lowpanfrag_high_thresh",
 		.data		= &init_net.ieee802154_lowpan.frags.high_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &init_net.ieee802154_lowpan.frags.low_thresh
 	},
 	{
 		.procname	= "6lowpanfrag_low_thresh",
 		.data		= &init_net.ieee802154_lowpan.frags.low_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &init_net.ieee802154_lowpan.frags.high_thresh
 	},
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b0366224f314..053869f2c49b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -678,23 +678,23 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
 EXPORT_SYMBOL(ip_check_defrag);
 
 #ifdef CONFIG_SYSCTL
-static int zero;
+static long zero;
 
 static struct ctl_table ip4_frags_ns_ctl_table[] = {
 	{
 		.procname	= "ipfrag_high_thresh",
 		.data		= &init_net.ipv4.frags.high_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &init_net.ipv4.frags.low_thresh
 	},
 	{
 		.procname	= "ipfrag_low_thresh",
 		.data		= &init_net.ipv4.frags.low_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &init_net.ipv4.frags.high_thresh
 	},
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index aacfce0d7d82..a058de677e94 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -71,7 +71,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 		   sock_prot_inuse_get(net, &udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n",
 		   sock_prot_inuse_get(net, &raw_prot));
-	seq_printf(seq,  "FRAG: inuse %u memory %u\n",
+	seq_printf(seq,  "FRAG: inuse %u memory %lu\n",
 		   atomic_read(&net->ipv4.frags.rhashtable.nelems),
 		   frag_mem_limit(&net->ipv4.frags));
 	return 0;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index d866412b8f6c..603a39592859 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -63,7 +63,7 @@ struct nf_ct_frag6_skb_cb
 static struct inet_frags nf_frags;
 
 #ifdef CONFIG_SYSCTL
-static int zero;
+static long zero;
 
 static struct ctl_table nf_ct_frag6_sysctl_table[] = {
 	{
@@ -76,18 +76,18 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_frag6_low_thresh",
 		.data		= &init_net.nf_frag.frags.low_thresh,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &init_net.nf_frag.frags.high_thresh
 	},
 	{
 		.procname	= "nf_conntrack_frag6_high_thresh",
 		.data		= &init_net.nf_frag.frags.high_thresh,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &init_net.nf_frag.frags.low_thresh
 	},
 	{ }
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 8befeb91e071..a85f7e0b14b1 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -47,7 +47,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
 			sock_prot_inuse_get(net, &udplitev6_prot));
 	seq_printf(seq, "RAW6: inuse %d\n",
 		       sock_prot_inuse_get(net, &rawv6_prot));
-	seq_printf(seq, "FRAG6: inuse %u memory %u\n",
+	seq_printf(seq, "FRAG6: inuse %u memory %lu\n",
 		   atomic_read(&net->ipv6.frags.rhashtable.nelems),
 		   frag_mem_limit(&net->ipv6.frags));
 	return 0;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 2a77fda5e3bc..905a8aee2671 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -552,15 +552,15 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = {
 	{
 		.procname	= "ip6frag_high_thresh",
 		.data		= &init_net.ipv6.frags.high_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &init_net.ipv6.frags.low_thresh
 	},
 	{
 		.procname	= "ip6frag_low_thresh",
 		.data		= &init_net.ipv6.frags.low_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
-- 
cgit v1.2.3


From 1eec5d5670084ee644597bd26c25e22c69b9f748 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:54 -0700
Subject: inet: frags: do not clone skb in ip_expire()

An skb_clone() was added in commit ec4fbd64751d ("inet: frag: release
spinlock before calling icmp_send()")

While fixing the bug at that time, it also added a very high cost
for DDOS frags, as the ICMP rate limit is applied after this
expensive operation (skb_clone() + consume_skb(), implying memory
allocations, copy, and freeing)

We can use skb_get(head) here, all we want is to make sure skb wont
be freed by another cpu.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_fragment.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 053869f2c49b..fb185d9a5cc7 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -143,8 +143,8 @@ static bool frag_expire_skip_icmp(u32 user)
 static void ip_expire(struct timer_list *t)
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
-	struct sk_buff *clone, *head;
 	const struct iphdr *iph;
+	struct sk_buff *head;
 	struct net *net;
 	struct ipq *qp;
 	int err;
@@ -187,16 +187,12 @@ static void ip_expire(struct timer_list *t)
 	    (skb_rtable(head)->rt_type != RTN_LOCAL))
 		goto out;
 
-	clone = skb_clone(head, GFP_ATOMIC);
+	skb_get(head);
+	spin_unlock(&qp->q.lock);
+	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+	kfree_skb(head);
+	goto out_rcu_unlock;
 
-	/* Send an ICMP "Fragment Reassembly Timeout" message. */
-	if (clone) {
-		spin_unlock(&qp->q.lock);
-		icmp_send(clone, ICMP_TIME_EXCEEDED,
-			  ICMP_EXC_FRAGTIME, 0);
-		consume_skb(clone);
-		goto out_rcu_unlock;
-	}
 out:
 	spin_unlock(&qp->q.lock);
 out_rcu_unlock:
-- 
cgit v1.2.3


From 05c0b86b9696802fd0ce5676a92a63f1b455bdf3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:55 -0700
Subject: ipv6: frags: rewrite ip6_expire_frag_queue()

Make it similar to IPv4 ip_expire(), and release the lock
before calling icmp functions.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/reassembly.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 905a8aee2671..2127da130dc2 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -92,7 +92,9 @@ EXPORT_SYMBOL(ip6_frag_init);
 void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
 {
 	struct net_device *dev = NULL;
+	struct sk_buff *head;
 
+	rcu_read_lock();
 	spin_lock(&fq->q.lock);
 
 	if (fq->q.flags & INET_FRAG_COMPLETE)
@@ -100,28 +102,34 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
 
 	inet_frag_kill(&fq->q);
 
-	rcu_read_lock();
 	dev = dev_get_by_index_rcu(net, fq->iif);
 	if (!dev)
-		goto out_rcu_unlock;
+		goto out;
 
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
 
 	/* Don't send error if the first segment did not arrive. */
-	if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments)
-		goto out_rcu_unlock;
+	head = fq->q.fragments;
+	if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head)
+		goto out;
 
 	/* But use as source device on which LAST ARRIVED
 	 * segment was received. And do not use fq->dev
 	 * pointer directly, device might already disappeared.
 	 */
-	fq->q.fragments->dev = dev;
-	icmpv6_send(fq->q.fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
-out_rcu_unlock:
-	rcu_read_unlock();
+	head->dev = dev;
+	skb_get(head);
+	spin_unlock(&fq->q.lock);
+
+	icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
+	kfree_skb(head);
+	goto out_rcu_unlock;
+
 out:
 	spin_unlock(&fq->q.lock);
+out_rcu_unlock:
+	rcu_read_unlock();
 	inet_frag_put(&fq->q);
 }
 EXPORT_SYMBOL(ip6_expire_frag_queue);
-- 
cgit v1.2.3


From bf66337140c64c27fa37222b7abca7e49d63fb57 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:58 -0700
Subject: inet: frags: get rid of ipfrag_skb_cb/FRAG_CB

ip_defrag uses skb->cb[] to store the fragment offset, and unfortunately
this integer is currently in a different cache line than skb->next,
meaning that we use two cache lines per skb when finding the insertion point.

By aliasing skb->ip_defrag_offset and skb->dev, we pack all the fields
in a single cache line and save precious memory bandwidth.

Note that after the fast path added by Changli Gao in commit
d6bebca92c66 ("fragment: add fast path for in-order fragments")
this change wont help the fast path, since we still need
to access prev->len (2nd cache line), but will show great
benefits when slow path is entered, since we perform
a linear scan of a potentially long list.

Also, note that this potential long list is an attack vector,
we might consider also using an rb-tree there eventually.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_fragment.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index fb185d9a5cc7..994fa70a910f 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -57,14 +57,6 @@
  */
 static const char ip_frag_cache_name[] = "ip4-frags";
 
-struct ipfrag_skb_cb
-{
-	struct inet_skb_parm	h;
-	int			offset;
-};
-
-#define FRAG_CB(skb)	((struct ipfrag_skb_cb *)((skb)->cb))
-
 /* Describe an entry in the "incomplete datagrams" queue. */
 struct ipq {
 	struct inet_frag_queue q;
@@ -353,13 +345,13 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	 * this fragment, right?
 	 */
 	prev = qp->q.fragments_tail;
-	if (!prev || FRAG_CB(prev)->offset < offset) {
+	if (!prev || prev->ip_defrag_offset < offset) {
 		next = NULL;
 		goto found;
 	}
 	prev = NULL;
 	for (next = qp->q.fragments; next != NULL; next = next->next) {
-		if (FRAG_CB(next)->offset >= offset)
+		if (next->ip_defrag_offset >= offset)
 			break;	/* bingo! */
 		prev = next;
 	}
@@ -370,7 +362,7 @@ found:
 	 * any overlaps are eliminated.
 	 */
 	if (prev) {
-		int i = (FRAG_CB(prev)->offset + prev->len) - offset;
+		int i = (prev->ip_defrag_offset + prev->len) - offset;
 
 		if (i > 0) {
 			offset += i;
@@ -387,8 +379,8 @@ found:
 
 	err = -ENOMEM;
 
-	while (next && FRAG_CB(next)->offset < end) {
-		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
+	while (next && next->ip_defrag_offset < end) {
+		int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */
 
 		if (i < next->len) {
 			/* Eat head of the next overlapped fragment
@@ -396,7 +388,7 @@ found:
 			 */
 			if (!pskb_pull(next, i))
 				goto err;
-			FRAG_CB(next)->offset += i;
+			next->ip_defrag_offset += i;
 			qp->q.meat -= i;
 			if (next->ip_summed != CHECKSUM_UNNECESSARY)
 				next->ip_summed = CHECKSUM_NONE;
@@ -420,7 +412,13 @@ found:
 		}
 	}
 
-	FRAG_CB(skb)->offset = offset;
+	/* Note : skb->ip_defrag_offset and skb->dev share the same location */
+	dev = skb->dev;
+	if (dev)
+		qp->iif = dev->ifindex;
+	/* Makes sure compiler wont do silly aliasing games */
+	barrier();
+	skb->ip_defrag_offset = offset;
 
 	/* Insert this fragment in the chain of fragments. */
 	skb->next = next;
@@ -431,11 +429,6 @@ found:
 	else
 		qp->q.fragments = skb;
 
-	dev = skb->dev;
-	if (dev) {
-		qp->iif = dev->ifindex;
-		skb->dev = NULL;
-	}
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
 	qp->ecn |= ecn;
@@ -511,7 +504,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	}
 
 	WARN_ON(!head);
-	WARN_ON(FRAG_CB(head)->offset != 0);
+	WARN_ON(head->ip_defrag_offset != 0);
 
 	/* Allocate a new buffer for the datagram. */
 	ihlen = ip_hdrlen(head);
-- 
cgit v1.2.3


From 219badfaade986a2c3d99abd4eb6d83f4f9ed2fb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:58:59 -0700
Subject: ipv6: frags: get rid of ip6frag_skb_cb/FRAG6_CB

ip6_frag_queue uses skb->cb[] to store the fragment offset, meaning that
we could use two cache lines per skb when finding the insertion point,
if for some reason inet6_skb_parm size is increased in the future.

By using skb->ip_defrag_offset instead of skb->cb[], we pack all
the fields in a single cache line, matching what we did for IPv4.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/reassembly.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 2127da130dc2..7b52efc63f6a 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -62,13 +62,6 @@
 
 static const char ip6_frag_cache_name[] = "ip6-frags";
 
-struct ip6frag_skb_cb {
-	struct inet6_skb_parm	h;
-	int			offset;
-};
-
-#define FRAG6_CB(skb)	((struct ip6frag_skb_cb *)((skb)->cb))
-
 static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
 {
 	return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
@@ -250,13 +243,13 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 	 * this fragment, right?
 	 */
 	prev = fq->q.fragments_tail;
-	if (!prev || FRAG6_CB(prev)->offset < offset) {
+	if (!prev || prev->ip_defrag_offset < offset) {
 		next = NULL;
 		goto found;
 	}
 	prev = NULL;
 	for (next = fq->q.fragments; next != NULL; next = next->next) {
-		if (FRAG6_CB(next)->offset >= offset)
+		if (next->ip_defrag_offset >= offset)
 			break;	/* bingo! */
 		prev = next;
 	}
@@ -271,14 +264,20 @@ found:
 
 	/* Check for overlap with preceding fragment. */
 	if (prev &&
-	    (FRAG6_CB(prev)->offset + prev->len) > offset)
+	    (prev->ip_defrag_offset + prev->len) > offset)
 		goto discard_fq;
 
 	/* Look for overlap with succeeding segment. */
-	if (next && FRAG6_CB(next)->offset < end)
+	if (next && next->ip_defrag_offset < end)
 		goto discard_fq;
 
-	FRAG6_CB(skb)->offset = offset;
+	/* Note : skb->ip_defrag_offset and skb->dev share the same location */
+	dev = skb->dev;
+	if (dev)
+		fq->iif = dev->ifindex;
+	/* Makes sure compiler wont do silly aliasing games */
+	barrier();
+	skb->ip_defrag_offset = offset;
 
 	/* Insert this fragment in the chain of fragments. */
 	skb->next = next;
@@ -289,11 +288,6 @@ found:
 	else
 		fq->q.fragments = skb;
 
-	dev = skb->dev;
-	if (dev) {
-		fq->iif = dev->ifindex;
-		skb->dev = NULL;
-	}
 	fq->q.stamp = skb->tstamp;
 	fq->q.meat += skb->len;
 	fq->ecn |= ecn;
@@ -380,7 +374,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	}
 
 	WARN_ON(head == NULL);
-	WARN_ON(FRAG6_CB(head)->offset != 0);
+	WARN_ON(head->ip_defrag_offset != 0);
 
 	/* Unfragmented part is taken from the first segment. */
 	payload_len = ((head->data - skb_network_header(head)) -
-- 
cgit v1.2.3


From f2d1c724fca176ddbd32a9397df49221afbcb227 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 12:59:00 -0700
Subject: inet: frags: get rid of nf_ct_frag6_skb_cb/NFCT_FRAG6_CB

nf_ct_frag6_queue() uses skb->cb[] to store the fragment offset,
meaning that we could use two cache lines per skb when finding
the insertion point, if for some reason inet6_skb_parm size
is increased in the future.

By using skb->ip_defrag_offset instead of skb->cb[] we pack all the fields
in a single cache line, matching what we did for IPv4.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 29 +++++++++++------------------
 1 file changed, 11 insertions(+), 18 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 603a39592859..3622aac343ae 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -52,14 +52,6 @@
 
 static const char nf_frags_cache_name[] = "nf-frags";
 
-struct nf_ct_frag6_skb_cb
-{
-	struct inet6_skb_parm	h;
-	int			offset;
-};
-
-#define NFCT_FRAG6_CB(skb)	((struct nf_ct_frag6_skb_cb *)((skb)->cb))
-
 static struct inet_frags nf_frags;
 
 #ifdef CONFIG_SYSCTL
@@ -270,13 +262,13 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 	 * this fragment, right?
 	 */
 	prev = fq->q.fragments_tail;
-	if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) {
+	if (!prev || prev->ip_defrag_offset < offset) {
 		next = NULL;
 		goto found;
 	}
 	prev = NULL;
 	for (next = fq->q.fragments; next != NULL; next = next->next) {
-		if (NFCT_FRAG6_CB(next)->offset >= offset)
+		if (next->ip_defrag_offset >= offset)
 			break;	/* bingo! */
 		prev = next;
 	}
@@ -292,14 +284,19 @@ found:
 
 	/* Check for overlap with preceding fragment. */
 	if (prev &&
-	    (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset)
+	    (prev->ip_defrag_offset + prev->len) > offset)
 		goto discard_fq;
 
 	/* Look for overlap with succeeding segment. */
-	if (next && NFCT_FRAG6_CB(next)->offset < end)
+	if (next && next->ip_defrag_offset < end)
 		goto discard_fq;
 
-	NFCT_FRAG6_CB(skb)->offset = offset;
+	/* Note : skb->ip_defrag_offset and skb->dev share the same location */
+	if (skb->dev)
+		fq->iif = skb->dev->ifindex;
+	/* Makes sure compiler wont do silly aliasing games */
+	barrier();
+	skb->ip_defrag_offset = offset;
 
 	/* Insert this fragment in the chain of fragments. */
 	skb->next = next;
@@ -310,10 +307,6 @@ found:
 	else
 		fq->q.fragments = skb;
 
-	if (skb->dev) {
-		fq->iif = skb->dev->ifindex;
-		skb->dev = NULL;
-	}
 	fq->q.stamp = skb->tstamp;
 	fq->q.meat += skb->len;
 	fq->ecn |= ecn;
@@ -357,7 +350,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev,  struct net_devic
 	inet_frag_kill(&fq->q);
 
 	WARN_ON(head == NULL);
-	WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);
+	WARN_ON(head->ip_defrag_offset != 0);
 
 	ecn = ip_frag_ecn_table[fq->ecn];
 	if (unlikely(ecn == 0xff))
-- 
cgit v1.2.3


From dd0bed1665d6ca17efd747a90a0bb804b4bf2005 Mon Sep 17 00:00:00 2001
From: Atul Gupta <atul.gupta@chelsio.com>
Date: Sat, 31 Mar 2018 21:41:52 +0530
Subject: tls: support for Inline tls record

Facility to register Inline TLS drivers to net/tls. Setup
TLS_HW_RECORD prot to listen on offload device.

Cases handled
- Inline TLS device exists, setup prot for TLS_HW_RECORD
- Atleast one Inline TLS exists, sets TLS_HW_RECORD.
- If non-inline device establish connection, move to TLS_SW_TX

Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_main.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 111 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 6f5c1146da4a..0d379970960e 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -38,6 +38,7 @@
 #include <linux/highmem.h>
 #include <linux/netdevice.h>
 #include <linux/sched/signal.h>
+#include <linux/inetdevice.h>
 
 #include <net/tls.h>
 
@@ -56,11 +57,14 @@ enum {
 	TLS_SW_TX,
 	TLS_SW_RX,
 	TLS_SW_RXTX,
+	TLS_HW_RECORD,
 	TLS_NUM_CONFIG,
 };
 
 static struct proto *saved_tcpv6_prot;
 static DEFINE_MUTEX(tcpv6_prot_mutex);
+static LIST_HEAD(device_list);
+static DEFINE_MUTEX(device_mutex);
 static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG];
 static struct proto_ops tls_sw_proto_ops;
 
@@ -241,8 +245,12 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 	lock_sock(sk);
 	sk_proto_close = ctx->sk_proto_close;
 
+	if (ctx->conf == TLS_HW_RECORD)
+		goto skip_tx_cleanup;
+
 	if (ctx->conf == TLS_BASE) {
 		kfree(ctx);
+		ctx = NULL;
 		goto skip_tx_cleanup;
 	}
 
@@ -276,6 +284,11 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
 skip_tx_cleanup:
 	release_sock(sk);
 	sk_proto_close(sk, timeout);
+	/* free ctx for TLS_HW_RECORD, used by tcp_set_state
+	 * for sk->sk_prot->unhash [tls_hw_unhash]
+	 */
+	if (ctx && ctx->conf == TLS_HW_RECORD)
+		kfree(ctx);
 }
 
 static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval,
@@ -493,6 +506,79 @@ static int tls_setsockopt(struct sock *sk, int level, int optname,
 	return do_tls_setsockopt(sk, optname, optval, optlen);
 }
 
+static struct tls_context *create_ctx(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tls_context *ctx;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	icsk->icsk_ulp_data = ctx;
+	return ctx;
+}
+
+static int tls_hw_prot(struct sock *sk)
+{
+	struct tls_context *ctx;
+	struct tls_device *dev;
+	int rc = 0;
+
+	mutex_lock(&device_mutex);
+	list_for_each_entry(dev, &device_list, dev_list) {
+		if (dev->feature && dev->feature(dev)) {
+			ctx = create_ctx(sk);
+			if (!ctx)
+				goto out;
+
+			ctx->hash = sk->sk_prot->hash;
+			ctx->unhash = sk->sk_prot->unhash;
+			ctx->sk_proto_close = sk->sk_prot->close;
+			ctx->conf = TLS_HW_RECORD;
+			update_sk_prot(sk, ctx);
+			rc = 1;
+			break;
+		}
+	}
+out:
+	mutex_unlock(&device_mutex);
+	return rc;
+}
+
+static void tls_hw_unhash(struct sock *sk)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+	struct tls_device *dev;
+
+	mutex_lock(&device_mutex);
+	list_for_each_entry(dev, &device_list, dev_list) {
+		if (dev->unhash)
+			dev->unhash(dev, sk);
+	}
+	mutex_unlock(&device_mutex);
+	ctx->unhash(sk);
+}
+
+static int tls_hw_hash(struct sock *sk)
+{
+	struct tls_context *ctx = tls_get_ctx(sk);
+	struct tls_device *dev;
+	int err;
+
+	err = ctx->hash(sk);
+	mutex_lock(&device_mutex);
+	list_for_each_entry(dev, &device_list, dev_list) {
+		if (dev->hash)
+			err |= dev->hash(dev, sk);
+	}
+	mutex_unlock(&device_mutex);
+
+	if (err)
+		tls_hw_unhash(sk);
+	return err;
+}
+
 static void build_protos(struct proto *prot, struct proto *base)
 {
 	prot[TLS_BASE] = *base;
@@ -511,15 +597,22 @@ static void build_protos(struct proto *prot, struct proto *base)
 	prot[TLS_SW_RXTX] = prot[TLS_SW_TX];
 	prot[TLS_SW_RXTX].recvmsg	= tls_sw_recvmsg;
 	prot[TLS_SW_RXTX].close		= tls_sk_proto_close;
+
+	prot[TLS_HW_RECORD] = *base;
+	prot[TLS_HW_RECORD].hash	= tls_hw_hash;
+	prot[TLS_HW_RECORD].unhash	= tls_hw_unhash;
+	prot[TLS_HW_RECORD].close	= tls_sk_proto_close;
 }
 
 static int tls_init(struct sock *sk)
 {
 	int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
-	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tls_context *ctx;
 	int rc = 0;
 
+	if (tls_hw_prot(sk))
+		goto out;
+
 	/* The TLS ulp is currently supported only for TCP sockets
 	 * in ESTABLISHED state.
 	 * Supporting sockets in LISTEN state will require us
@@ -530,12 +623,11 @@ static int tls_init(struct sock *sk)
 		return -ENOTSUPP;
 
 	/* allocate tls context */
-	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	ctx = create_ctx(sk);
 	if (!ctx) {
 		rc = -ENOMEM;
 		goto out;
 	}
-	icsk->icsk_ulp_data = ctx;
 	ctx->setsockopt = sk->sk_prot->setsockopt;
 	ctx->getsockopt = sk->sk_prot->getsockopt;
 	ctx->sk_proto_close = sk->sk_prot->close;
@@ -557,6 +649,22 @@ out:
 	return rc;
 }
 
+void tls_register_device(struct tls_device *device)
+{
+	mutex_lock(&device_mutex);
+	list_add_tail(&device->dev_list, &device_list);
+	mutex_unlock(&device_mutex);
+}
+EXPORT_SYMBOL(tls_register_device);
+
+void tls_unregister_device(struct tls_device *device)
+{
+	mutex_lock(&device_mutex);
+	list_del(&device->dev_list);
+	mutex_unlock(&device_mutex);
+}
+EXPORT_SYMBOL(tls_unregister_device);
+
 static struct tcp_ulp_ops tcp_tls_ulp_ops __read_mostly = {
 	.name			= "tls",
 	.uid			= TCP_ULP_TLS,
-- 
cgit v1.2.3


From e0be6bea2583486ec4ed98e36437d82ea8190811 Mon Sep 17 00:00:00 2001
From: Atul Gupta <atul.gupta@chelsio.com>
Date: Sat, 31 Mar 2018 21:41:53 +0530
Subject: ethtool: enable Inline TLS in HW

Ethtool option enables TLS record offload on HW, user
configures the feature for netdev capable of Inline TLS.
This allows user to define custom sk_prot for Inline TLS sock

Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index eb55252ca1fb..03416e6dd5d7 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -108,6 +108,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_HW_ESP_BIT] =		 "esp-hw-offload",
 	[NETIF_F_HW_ESP_TX_CSUM_BIT] =	 "esp-tx-csum-hw-offload",
 	[NETIF_F_RX_UDP_TUNNEL_PORT_BIT] =	 "rx-udp_tunnel-port-offload",
+	[NETIF_F_HW_TLS_RECORD_BIT] =	"tls-hw-record",
 };
 
 static const char
-- 
cgit v1.2.3


From cc35c88ae4db219611e204375d6a4248bc0e84d6 Mon Sep 17 00:00:00 2001
From: Atul Gupta <atul.gupta@chelsio.com>
Date: Sat, 31 Mar 2018 21:41:59 +0530
Subject: crypto : chtls - CPL handler definition

Exchange messages with hardware to program the TLS session
CPL handlers for messages received from chip.

Signed-off-by: Atul Gupta <atul.gupta@chelsio.com>
Signed-off-by: Michael Werner <werner@chelsio.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_minisocks.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e7e36433cdb5..57b5468b5139 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -332,6 +332,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 	tcp_update_metrics(sk);
 	tcp_done(sk);
 }
+EXPORT_SYMBOL(tcp_time_wait);
 
 void tcp_twsk_destructor(struct sock *sk)
 {
-- 
cgit v1.2.3


From ec1258903a13d0255695a07280ae5e32303aa6c5 Mon Sep 17 00:00:00 2001
From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Fri, 30 Mar 2018 20:34:33 +0300
Subject: ip6_gre: remove redundant 'tunnel' setting in ip6erspan_tap_init()

'tunnel' was already set at the start of ip6erspan_tap_init().

Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_gre.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 22e86557aca4..f8a103bdbd60 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1762,7 +1762,6 @@ static int ip6erspan_tap_init(struct net_device *dev)
 		dev->mtu -= 8;
 
 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
-	tunnel = netdev_priv(dev);
 	ip6gre_tnl_link_config(tunnel, 1);
 
 	return 0;
-- 
cgit v1.2.3


From 694aba690de062cf27b28a5e56e7a5a7185b0a1c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 13:16:25 -0700
Subject: ipv4: factorize sk_wmem_alloc updates done by __ip_append_data()

While testing my inet defrag changes, I found that the senders
could spend ~20% of cpu cycles in skb_set_owner_w() updating
sk->sk_wmem_alloc for every fragment they cook.

The solution to this problem is to use alloc_skb() instead
of sock_wmalloc() and manually perform a single sk_wmem_alloc change.

Similar change for IPv6 is provided in following patch.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 66340ab750e6..94cacae76aca 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -876,6 +876,7 @@ static int __ip_append_data(struct sock *sk,
 	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
 	int csummode = CHECKSUM_NONE;
 	struct rtable *rt = (struct rtable *)cork->dst;
+	unsigned int wmem_alloc_delta = 0;
 	u32 tskey = 0;
 
 	skb = skb_peek_tail(queue);
@@ -971,11 +972,10 @@ alloc_new_skb:
 						(flags & MSG_DONTWAIT), &err);
 			} else {
 				skb = NULL;
-				if (refcount_read(&sk->sk_wmem_alloc) <=
+				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
 				    2 * sk->sk_sndbuf)
-					skb = sock_wmalloc(sk,
-							   alloclen + hh_len + 15, 1,
-							   sk->sk_allocation);
+					skb = alloc_skb(alloclen + hh_len + 15,
+							sk->sk_allocation);
 				if (unlikely(!skb))
 					err = -ENOBUFS;
 			}
@@ -1033,6 +1033,11 @@ alloc_new_skb:
 			/*
 			 * Put the packet on the pending queue.
 			 */
+			if (!skb->destructor) {
+				skb->destructor = sock_wfree;
+				skb->sk = sk;
+				wmem_alloc_delta += skb->truesize;
+			}
 			__skb_queue_tail(queue, skb);
 			continue;
 		}
@@ -1079,12 +1084,13 @@ alloc_new_skb:
 			skb->len += copy;
 			skb->data_len += copy;
 			skb->truesize += copy;
-			refcount_add(copy, &sk->sk_wmem_alloc);
+			wmem_alloc_delta += copy;
 		}
 		offset += copy;
 		length -= copy;
 	}
 
+	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
 	return 0;
 
 error_efault:
@@ -1092,6 +1098,7 @@ error_efault:
 error:
 	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
+	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
 	return err;
 }
 
-- 
cgit v1.2.3


From 1f4c6eb24029689a40dceae561e31ff6926d7f0d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 31 Mar 2018 13:16:26 -0700
Subject: ipv6: factorize sk_wmem_alloc updates done by __ip6_append_data()

While testing my inet defrag changes, I found that the senders
could spend ~20% of cpu cycles in skb_set_owner_w() updating
sk->sk_wmem_alloc for every fragment they cook, competing
with TX completion of prior skbs possibly happening on another cpus.

The solution to this problem is to use alloc_skb() instead
of sock_wmalloc() and manually perform a single sk_wmem_alloc change.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 2c7f09c3c39e..323d7a354ffb 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1259,6 +1259,7 @@ static int __ip6_append_data(struct sock *sk,
 	struct ipv6_txoptions *opt = v6_cork->opt;
 	int csummode = CHECKSUM_NONE;
 	unsigned int maxnonfragsize, headersize;
+	unsigned int wmem_alloc_delta = 0;
 
 	skb = skb_peek_tail(queue);
 	if (!skb) {
@@ -1411,11 +1412,10 @@ alloc_new_skb:
 						(flags & MSG_DONTWAIT), &err);
 			} else {
 				skb = NULL;
-				if (refcount_read(&sk->sk_wmem_alloc) <=
+				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
 				    2 * sk->sk_sndbuf)
-					skb = sock_wmalloc(sk,
-							   alloclen + hh_len, 1,
-							   sk->sk_allocation);
+					skb = alloc_skb(alloclen + hh_len,
+							sk->sk_allocation);
 				if (unlikely(!skb))
 					err = -ENOBUFS;
 			}
@@ -1474,6 +1474,11 @@ alloc_new_skb:
 			/*
 			 * Put the packet on the pending queue
 			 */
+			if (!skb->destructor) {
+				skb->destructor = sock_wfree;
+				skb->sk = sk;
+				wmem_alloc_delta += skb->truesize;
+			}
 			__skb_queue_tail(queue, skb);
 			continue;
 		}
@@ -1520,12 +1525,13 @@ alloc_new_skb:
 			skb->len += copy;
 			skb->data_len += copy;
 			skb->truesize += copy;
-			refcount_add(copy, &sk->sk_wmem_alloc);
+			wmem_alloc_delta += copy;
 		}
 		offset += copy;
 		length -= copy;
 	}
 
+	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
 	return 0;
 
 error_efault:
@@ -1533,6 +1539,7 @@ error_efault:
 error:
 	cork->length -= length;
 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
+	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
 	return err;
 }
 
-- 
cgit v1.2.3


From 9ea471320e1302be0fac67c14a7ab7982609fea7 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavo@embeddedor.com>
Date: Fri, 30 Mar 2018 16:05:06 -0500
Subject: Bluetooth: Mark expected switch fall-throughs

In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/mgmt.c        | 1 +
 net/bluetooth/rfcomm/sock.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 6e9fc86d8daf..8a80d48d89c4 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -4801,6 +4801,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
 		case MGMT_LTK_P256_DEBUG:
 			authenticated = 0x00;
 			type = SMP_LTK_P256_DEBUG;
+			/* fall through */
 		default:
 			continue;
 		}
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 93a3b219db09..d606e9212291 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -221,6 +221,7 @@ static void __rfcomm_sock_close(struct sock *sk)
 	case BT_CONFIG:
 	case BT_CONNECTED:
 		rfcomm_dlc_close(d, 0);
+		/* fall through */
 
 	default:
 		sock_set_flag(sk, SOCK_ZAPPED);
-- 
cgit v1.2.3


From ec1d8ccb07deaf30fd0508af6755364ac47dc08d Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Fri, 30 Mar 2018 09:44:00 +0800
Subject: vlan: also check phy_driver ts_info for vlan's real device

Just like function ethtool_get_ts_info(), we should also consider the
phy_driver ts_info call back. For example, driver dp83640.

Fixes: 37dd9255b2f6 ("vlan: Pass ethtool get_ts_info queries to real device.")
Acked-by: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan_dev.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index f7e83f6d2e64..236452ebbd9e 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -29,6 +29,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
+#include <linux/phy.h>
 #include <net/arp.h>
 #include <net/switchdev.h>
 
@@ -665,8 +666,11 @@ static int vlan_ethtool_get_ts_info(struct net_device *dev,
 {
 	const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
 	const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops;
+	struct phy_device *phydev = vlan->real_dev->phydev;
 
-	if (ops->get_ts_info) {
+	if (phydev && phydev->drv && phydev->drv->ts_info) {
+		 return phydev->drv->ts_info(phydev, info);
+	} else if (ops->get_ts_info) {
 		return ops->get_ts_info(vlan->real_dev, info);
 	} else {
 		info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
-- 
cgit v1.2.3


From da01ec4ee5e69933641e6017d6461d0ce9dbfac6 Mon Sep 17 00:00:00 2001
From: Li RongQing <lirongqing@baidu.com>
Date: Fri, 30 Mar 2018 10:11:21 +0800
Subject: net: sched: do not emit messages while holding spinlock

move messages emitting out of sch_tree_lock to avoid holding
this lock too long.

Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_htb.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 1ea9846cc6ce..2a4ab7caf553 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1337,6 +1337,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 	struct nlattr *tb[TCA_HTB_MAX + 1];
 	struct tc_htb_opt *hopt;
 	u64 rate64, ceil64;
+	int warn = 0;
 
 	/* extract all subattrs from opt attr */
 	if (!opt)
@@ -1499,13 +1500,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		cl->quantum = min_t(u64, quantum, INT_MAX);
 
 		if (!hopt->quantum && cl->quantum < 1000) {
-			pr_warn("HTB: quantum of class %X is small. Consider r2q change.\n",
-				cl->common.classid);
+			warn = -1;
 			cl->quantum = 1000;
 		}
 		if (!hopt->quantum && cl->quantum > 200000) {
-			pr_warn("HTB: quantum of class %X is big. Consider r2q change.\n",
-				cl->common.classid);
+			warn = 1;
 			cl->quantum = 200000;
 		}
 		if (hopt->quantum)
@@ -1519,6 +1518,10 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 
 	sch_tree_unlock(sch);
 
+	if (warn)
+		pr_warn("HTB: quantum of class %X is %s. Consider r2q change.\n",
+			    cl->common.classid, (warn == -1 ? "small" : "big"));
+
 	qdisc_class_hash_grow(sch, &q->clhash);
 
 	*arg = (unsigned long)cl;
-- 
cgit v1.2.3


From 6174a30df1b902e1fedbd728f5343937e83e64e6 Mon Sep 17 00:00:00 2001
From: Xin Long <lucien.xin@gmail.com>
Date: Sun, 1 Apr 2018 22:40:35 +0800
Subject: route: check sysctl_fib_multipath_use_neigh earlier than hash

Prior to this patch, when one packet is hashed into path [1]
(hash <= nh_upper_bound) and it's neigh is dead, it will try
path [2]. However, if path [2]'s neigh is alive but it's
hash > nh_upper_bound, it will not return this alive path.
This packet will never be sent even if path [2] is alive.

 3.3.3.1/24:
  nexthop via 1.1.1.254 dev eth1 weight 1 <--[1] (dead neigh)
  nexthop via 2.2.2.254 dev eth2 weight 1 <--[2]

With sysctl_fib_multipath_use_neigh set is supposed to find an
available path respecting to the l3/l4 hash. But if there is
no available route with this hash, it should at least return
an alive route even with other hash.

This patch is to fix it by processing fib_multipath_use_neigh
earlier than the hash check, so that it will at least return
an alive route if there is when fib_multipath_use_neigh is
enabled. It's also compatible with before when there are alive
routes with the l3/l4 hash.

Fixes: a6db4494d218 ("net: ipv4: Consider failed nexthops in multipath routes")
Reported-by: Jianlin Shi <jishi@redhat.com>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_semantics.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e7c602c600ac..c27122f01b87 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1746,18 +1746,20 @@ void fib_select_multipath(struct fib_result *res, int hash)
 	bool first = false;
 
 	for_nexthops(fi) {
+		if (net->ipv4.sysctl_fib_multipath_use_neigh) {
+			if (!fib_good_nh(nh))
+				continue;
+			if (!first) {
+				res->nh_sel = nhsel;
+				first = true;
+			}
+		}
+
 		if (hash > atomic_read(&nh->nh_upper_bound))
 			continue;
 
-		if (!net->ipv4.sysctl_fib_multipath_use_neigh ||
-		    fib_good_nh(nh)) {
-			res->nh_sel = nhsel;
-			return;
-		}
-		if (!first) {
-			res->nh_sel = nhsel;
-			first = true;
-		}
+		res->nh_sel = nhsel;
+		return;
 	} endfor_nexthops(fi);
 }
 #endif
-- 
cgit v1.2.3


From 6e00f7dd5e4edc2443f030b226f66fe4f1267667 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 1 Apr 2018 21:57:59 -0700
Subject: ipv6: frags: fix /proc/sys/net/ipv6/ip6frag_low_thresh
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I forgot to change ip6frag_low_thresh proc_handler
from proc_dointvec_minmax to proc_doulongvec_minmax

Fixes: 3e67f106f619 ("inet: frags: break the 2GB limit for frags storage")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Maciej Żenczykowski <maze@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/reassembly.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 7b52efc63f6a..70e4a578b2fb 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -564,7 +564,7 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = {
 		.data		= &init_net.ipv6.frags.low_thresh,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &init_net.ipv6.frags.high_thresh
 	},
-- 
cgit v1.2.3