From adf08d481b52824c87af028fc74dc692d179f7f4 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 13 Oct 2016 19:07:54 +0900
Subject: regmap: include <linux/delay.h> from include/linux/regmap.h

The readx_poll_timeout() macro calls usleep_range(), which is
declared in <linux/delay.h>.

Make include/linux/regmap.h include <linux/delay.h>, like
include/linux/iopoll.h does.  Otherwise, users of the macro
will see "implicit declaration of function 'usleep_range'" error.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regmap.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 9adc7b21903d..e5157e39ff22 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -15,6 +15,7 @@
 
 #include <linux/list.h>
 #include <linux/rbtree.h>
+#include <linux/delay.h>
 #include <linux/err.h>
 #include <linux/bug.h>
 #include <linux/lockdep.h>
-- 
cgit v1.2.3


From 0189efb8f4f830b9ac7a7c56c0c6e260859e950d Mon Sep 17 00:00:00 2001
From: Yuval Mintz <Yuval.Mintz@caviumnetworks.com>
Date: Thu, 13 Oct 2016 22:57:02 +0300
Subject: qed*: Fix Kconfig dependencies with INFINIBAND_QEDR

The qedr driver would require a tristate Kconfig option [to allow
it to compile as a module], and toward that end we've added the
INFINIBAND_QEDR option. But as we've made the compilation of the
qed/qede infrastructure required for RoCE dependent on the option
we'd be facing linking difficulties in case that QED=y or QEDE=y,
and INFINIBAND_QEDR=m.

To resolve this, we seperate between the INFINIBAND_QEDR option
and the infrastructure support in qed/qede by introducing a new
QED_RDMA option which would be selected by INFINIBAND_QEDR but would
be a boolean instead of a tristate; Following that, the qed/qede is
fixed based on this new option so that all config combinations would
be supported.

Fixes: cee9fbd8e2e9 ("qede: add qedr framework")
Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Yuval Mintz <Yuval.Mintz@caviumnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qede_roce.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/qed/qede_roce.h b/include/linux/qed/qede_roce.h
index 99fbe6d55acb..f48d64b0e2fb 100644
--- a/include/linux/qed/qede_roce.h
+++ b/include/linux/qed/qede_roce.h
@@ -68,7 +68,7 @@ void qede_roce_unregister_driver(struct qedr_driver *drv);
 
 bool qede_roce_supported(struct qede_dev *dev);
 
-#if IS_ENABLED(CONFIG_INFINIBAND_QEDR)
+#if IS_ENABLED(CONFIG_QED_RDMA)
 int qede_roce_dev_add(struct qede_dev *dev);
 void qede_roce_dev_event_open(struct qede_dev *dev);
 void qede_roce_dev_event_close(struct qede_dev *dev);
-- 
cgit v1.2.3


From a04a480d4392ea6efd117be2de564117b2a009c0 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Sun, 16 Oct 2016 20:02:52 -0700
Subject: net: Require exact match for TCP socket lookups if dif is l3mdev

Currently, socket lookups for l3mdev (vrf) use cases can match a socket
that is bound to a port but not a device (ie., a global socket). If the
sysctl tcp_l3mdev_accept is not set this leads to ack packets going out
based on the main table even though the packet came in from an L3 domain.
The end result is that the connection does not establish creating
confusion for users since the service is running and a socket shows in
ss output. Fix by requiring an exact dif to sk_bound_dev_if match if the
skb came through an interface enslaved to an l3mdev device and the
tcp_l3mdev_accept is not set.

skb's through an l3mdev interface are marked by setting a flag in
inet{6}_skb_parm. The IPv6 variant is already set; this patch adds the
flag for IPv4. Using an skb flag avoids a device lookup on the dif. The
flag is set in the VRF driver using the IP{6}CB macros. For IPv4, the
inet_skb_parm struct is moved in the cb per commit 971f10eca186, so the
match function in the TCP stack needs to use TCP_SKB_CB. For IPv6, the
move is done after the socket lookup, so IP6CB is used.

The flags field in inet_skb_parm struct needs to be increased to add
another flag. There is currently a 1-byte hole following the flags,
so it can be expanded to u16 without increasing the size of the struct.

Fixes: 193125dbd8eb ("net: Introduce VRF device driver")
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ipv6.h | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 7e9a789be5e0..ca1ad9ebbc92 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -123,12 +123,12 @@ struct inet6_skb_parm {
 };
 
 #if defined(CONFIG_NET_L3_MASTER_DEV)
-static inline bool skb_l3mdev_slave(__u16 flags)
+static inline bool ipv6_l3mdev_skb(__u16 flags)
 {
 	return flags & IP6SKB_L3SLAVE;
 }
 #else
-static inline bool skb_l3mdev_slave(__u16 flags)
+static inline bool ipv6_l3mdev_skb(__u16 flags)
 {
 	return false;
 }
@@ -139,11 +139,22 @@ static inline bool skb_l3mdev_slave(__u16 flags)
 
 static inline int inet6_iif(const struct sk_buff *skb)
 {
-	bool l3_slave = skb_l3mdev_slave(IP6CB(skb)->flags);
+	bool l3_slave = ipv6_l3mdev_skb(IP6CB(skb)->flags);
 
 	return l3_slave ? skb->skb_iif : IP6CB(skb)->iif;
 }
 
+/* can not be used in TCP layer after tcp_v6_fill_cb */
+static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb)
+{
+#if defined(CONFIG_NET_L3_MASTER_DEV)
+	if (!net->ipv4.sysctl_tcp_l3mdev_accept &&
+	    ipv6_l3mdev_skb(IP6CB(skb)->flags))
+		return true;
+#endif
+	return false;
+}
+
 struct tcp6_request_sock {
 	struct tcp_request_sock	  tcp6rsk_tcp;
 };
-- 
cgit v1.2.3


From e4961b0768852d9eb7383e1a5df178eacb714656 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@mellanox.com>
Date: Wed, 19 Oct 2016 16:57:08 +0300
Subject: net: core: Correctly iterate over lower adjacency list

Tamir reported the following trace when processing ARP requests received
via a vlan device on top of a VLAN-aware bridge:

 NMI watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [swapper/1:0]
[...]
 CPU: 1 PID: 0 Comm: swapper/1 Tainted: G        W       4.8.0-rc7 #1
 Hardware name: Mellanox Technologies Ltd. "MSN2100-CB2F"/"SA001017", BIOS 5.6.5 06/07/2016
 task: ffff88017edfea40 task.stack: ffff88017ee10000
 RIP: 0010:[<ffffffff815dcc73>]  [<ffffffff815dcc73>] netdev_all_lower_get_next_rcu+0x33/0x60
[...]
 Call Trace:
  <IRQ>
  [<ffffffffa015de0a>] mlxsw_sp_port_lower_dev_hold+0x5a/0xa0 [mlxsw_spectrum]
  [<ffffffffa016f1b0>] mlxsw_sp_router_netevent_event+0x80/0x150 [mlxsw_spectrum]
  [<ffffffff810ad07a>] notifier_call_chain+0x4a/0x70
  [<ffffffff810ad13a>] atomic_notifier_call_chain+0x1a/0x20
  [<ffffffff815ee77b>] call_netevent_notifiers+0x1b/0x20
  [<ffffffff815f2eb6>] neigh_update+0x306/0x740
  [<ffffffff815f38ce>] neigh_event_ns+0x4e/0xb0
  [<ffffffff8165ea3f>] arp_process+0x66f/0x700
  [<ffffffff8170214c>] ? common_interrupt+0x8c/0x8c
  [<ffffffff8165ec29>] arp_rcv+0x139/0x1d0
  [<ffffffff816e505a>] ? vlan_do_receive+0xda/0x320
  [<ffffffff815e3794>] __netif_receive_skb_core+0x524/0xab0
  [<ffffffff815e6830>] ? dev_queue_xmit+0x10/0x20
  [<ffffffffa06d612d>] ? br_forward_finish+0x3d/0xc0 [bridge]
  [<ffffffffa06e5796>] ? br_handle_vlan+0xf6/0x1b0 [bridge]
  [<ffffffff815e3d38>] __netif_receive_skb+0x18/0x60
  [<ffffffff815e3dc0>] netif_receive_skb_internal+0x40/0xb0
  [<ffffffff815e3e4c>] netif_receive_skb+0x1c/0x70
  [<ffffffffa06d7856>] br_pass_frame_up+0xc6/0x160 [bridge]
  [<ffffffffa06d63d7>] ? deliver_clone+0x37/0x50 [bridge]
  [<ffffffffa06d656c>] ? br_flood+0xcc/0x160 [bridge]
  [<ffffffffa06d7b14>] br_handle_frame_finish+0x224/0x4f0 [bridge]
  [<ffffffffa06d7f94>] br_handle_frame+0x174/0x300 [bridge]
  [<ffffffff815e3599>] __netif_receive_skb_core+0x329/0xab0
  [<ffffffff81374815>] ? find_next_bit+0x15/0x20
  [<ffffffff8135e802>] ? cpumask_next_and+0x32/0x50
  [<ffffffff810c9968>] ? load_balance+0x178/0x9b0
  [<ffffffff815e3d38>] __netif_receive_skb+0x18/0x60
  [<ffffffff815e3dc0>] netif_receive_skb_internal+0x40/0xb0
  [<ffffffff815e3e4c>] netif_receive_skb+0x1c/0x70
  [<ffffffffa01544a1>] mlxsw_sp_rx_listener_func+0x61/0xb0 [mlxsw_spectrum]
  [<ffffffffa005c9f7>] mlxsw_core_skb_receive+0x187/0x200 [mlxsw_core]
  [<ffffffffa007332a>] mlxsw_pci_cq_tasklet+0x63a/0x9b0 [mlxsw_pci]
  [<ffffffff81091986>] tasklet_action+0xf6/0x110
  [<ffffffff81704556>] __do_softirq+0xf6/0x280
  [<ffffffff8109213f>] irq_exit+0xdf/0xf0
  [<ffffffff817042b4>] do_IRQ+0x54/0xd0
  [<ffffffff8170214c>] common_interrupt+0x8c/0x8c

The problem is that netdev_all_lower_get_next_rcu() never advances the
iterator, thereby causing the loop over the lower adjacency list to run
forever.

Fix this by advancing the iterator and avoid the infinite loop.

Fixes: 7ce856aaaf13 ("mlxsw: spectrum: Add couple of lower device helper functions")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reported-by: Tamir Winetroub <tamirw@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Acked-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 136ae6bbe81e..465e128699a1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3877,7 +3877,7 @@ struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
 	     ldev = netdev_all_lower_get_next(dev, &(iter)))
 
 #define netdev_for_each_all_lower_dev_rcu(dev, ldev, iter) \
-	for (iter = (dev)->all_adj_list.lower.next, \
+	for (iter = &(dev)->all_adj_list.lower, \
 	     ldev = netdev_all_lower_get_next_rcu(dev, &(iter)); \
 	     ldev; \
 	     ldev = netdev_all_lower_get_next_rcu(dev, &(iter)))
-- 
cgit v1.2.3


From d33fd776f992332be110f6ceca6dad60cb5d513f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Oct 2016 15:51:28 +1100
Subject: iomap: add IOMAP_REPORT

This allows the file system to tell a FIEMAP from a read operation, and thus
avoids the need to report flags that aren't actually used in the read path.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 include/linux/iomap.h | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e63e288dee83..7892f55a1866 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -19,11 +19,15 @@ struct vm_fault;
 #define IOMAP_UNWRITTEN	0x04	/* blocks allocated @blkno in unwritten state */
 
 /*
- * Flags for iomap mappings:
+ * Flags for all iomap mappings:
  */
-#define IOMAP_F_MERGED	0x01	/* contains multiple blocks/extents */
-#define IOMAP_F_SHARED	0x02	/* block shared with another file */
-#define IOMAP_F_NEW	0x04	/* blocks have been newly allocated */
+#define IOMAP_F_NEW	0x01	/* blocks have been newly allocated */
+
+/*
+ * Flags that only need to be reported for IOMAP_REPORT requests:
+ */
+#define IOMAP_F_MERGED	0x10	/* contains multiple blocks/extents */
+#define IOMAP_F_SHARED	0x20	/* block shared with another file */
 
 /*
  * Magic value for blkno:
@@ -42,8 +46,9 @@ struct iomap {
 /*
  * Flags for iomap_begin / iomap_end.  No flag implies a read.
  */
-#define IOMAP_WRITE		(1 << 0)
-#define IOMAP_ZERO		(1 << 1)
+#define IOMAP_WRITE		(1 << 0) /* writing, must allocate blocks */
+#define IOMAP_ZERO		(1 << 1) /* zeroing operation, may skip holes */
+#define IOMAP_REPORT		(1 << 2) /* report extent status, e.g. FIEMAP */
 
 struct iomap_ops {
 	/*
-- 
cgit v1.2.3


From fcd91dd449867c6bfe56a81cabba76b829fd05cd Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Thu, 20 Oct 2016 15:58:02 +0200
Subject: net: add recursion limit to GRO
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, GRO can do unlimited recursion through the gro_receive
handlers.  This was fixed for tunneling protocols by limiting tunnel GRO
to one level with encap_mark, but both VLAN and TEB still have this
problem.  Thus, the kernel is vulnerable to a stack overflow, if we
receive a packet composed entirely of VLAN headers.

This patch adds a recursion counter to the GRO layer to prevent stack
overflow.  When a gro_receive function hits the recursion limit, GRO is
aborted for this skb and it is processed normally.  This recursion
counter is put in the GRO CB, but could be turned into a percpu counter
if we run out of space in the CB.

Thanks to Vladimír Beneš <vbenes@redhat.com> for the initial bug report.

Fixes: CVE-2016-7039
Fixes: 9b174d88c257 ("net: Add Transparent Ethernet Bridging GRO support.")
Fixes: 66e5133f19e9 ("vlan: Add GRO support for non hardware accelerated vlan")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Jiri Benc <jbenc@redhat.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 465e128699a1..91ee3643ccc8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2169,7 +2169,10 @@ struct napi_gro_cb {
 	/* Used to determine if flush_id can be ignored */
 	u8	is_atomic:1;
 
-	/* 5 bit hole */
+	/* Number of gro_receive callbacks this packet already went through */
+	u8 recursion_counter:4;
+
+	/* 1 bit hole */
 
 	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
 	__wsum	csum;
@@ -2180,6 +2183,40 @@ struct napi_gro_cb {
 
 #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
 
+#define GRO_RECURSION_LIMIT 15
+static inline int gro_recursion_inc_test(struct sk_buff *skb)
+{
+	return ++NAPI_GRO_CB(skb)->recursion_counter == GRO_RECURSION_LIMIT;
+}
+
+typedef struct sk_buff **(*gro_receive_t)(struct sk_buff **, struct sk_buff *);
+static inline struct sk_buff **call_gro_receive(gro_receive_t cb,
+						struct sk_buff **head,
+						struct sk_buff *skb)
+{
+	if (unlikely(gro_recursion_inc_test(skb))) {
+		NAPI_GRO_CB(skb)->flush |= 1;
+		return NULL;
+	}
+
+	return cb(head, skb);
+}
+
+typedef struct sk_buff **(*gro_receive_sk_t)(struct sock *, struct sk_buff **,
+					     struct sk_buff *);
+static inline struct sk_buff **call_gro_receive_sk(gro_receive_sk_t cb,
+						   struct sock *sk,
+						   struct sk_buff **head,
+						   struct sk_buff *skb)
+{
+	if (unlikely(gro_recursion_inc_test(skb))) {
+		NAPI_GRO_CB(skb)->flush |= 1;
+		return NULL;
+	}
+
+	return cb(sk, head, skb);
+}
+
 struct packet_type {
 	__be16			type;	/* This is really htons(ether_type). */
 	struct net_device	*dev;	/* NULL is wildcarded here	     */
-- 
cgit v1.2.3


From 0e191827383a6503a3bc547e63c74ff093f450f5 Mon Sep 17 00:00:00 2001
From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Fri, 21 Oct 2016 04:43:42 -0400
Subject: qed*: Reduce the memory footprint for Rx path

With the current default values for Rx path i.e., 8 queues of 8Kb entries
each with 4Kb size, interface will consume 256Mb for Rx. The default values
causing the driver probe to fail when the system memory is low. Based on
the perforamnce results, rx-ring count value of 1Kb gives the comparable
performance with Rx coalesce timeout of 12 seconds. Updating the default
values.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/qed/qed_if.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index f9ae903bbb84..8978a60371f4 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -146,6 +146,7 @@ enum qed_led_mode {
 #define DIRECT_REG_RD(reg_addr) readl((void __iomem *)(reg_addr))
 
 #define QED_COALESCE_MAX 0xFF
+#define QED_DEFAULT_RX_USECS 12
 
 /* forward */
 struct qed_dev;
-- 
cgit v1.2.3


From f1caa61df2a3dc4c58316295c5dc5edba4c68d85 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@codeaurora.org>
Date: Mon, 24 Oct 2016 00:31:31 -0400
Subject: ACPI/PCI: pci_link: penalize SCI correctly

Ondrej reported that IRQs stopped working in v4.7 on several
platforms.  A typical scenario, from Ondrej's VT82C694X/694X, is:

ACPI: Using PIC for interrupt routing
ACPI: PCI Interrupt Link [LNKA] (IRQs 1 3 4 5 6 7 10 *11 12 14 15)
ACPI: No IRQ available for PCI Interrupt Link [LNKA]
8139too 0000:00:0f.0: PCI INT A: no GSI

We're using PIC routing, so acpi_irq_balance == 0, and LNKA is already
active at IRQ 11. In that case, acpi_pci_link_allocate() only tries
to use the active IRQ (IRQ 11) which also happens to be the SCI.

We should penalize the SCI by PIRQ_PENALTY_PCI_USING, but
irq_get_trigger_type(11) returns something other than
IRQ_TYPE_LEVEL_LOW, so we penalize it by PIRQ_PENALTY_ISA_ALWAYS
instead, which makes acpi_pci_link_allocate() assume the IRQ isn't
available and give up.

Add acpi_penalize_sci_irq() so platforms can tell us the SCI IRQ,
trigger, and polarity directly and we don't have to depend on
irq_get_trigger_type().

Fixes: 103544d86976 (ACPI,PCI,IRQ: reduce resource requirements)
Link: http://lkml.kernel.org/r/201609251512.05657.linux@rainbow-software.org
Reported-by: Ondrej Zary <linux@rainbow-software.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
Tested-by: Jonathan Liu <net147@gmail.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index ddbeda6dbdc8..689a8b9b9c8f 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -326,6 +326,7 @@ struct pci_dev;
 int acpi_pci_irq_enable (struct pci_dev *dev);
 void acpi_penalize_isa_irq(int irq, int active);
 bool acpi_isa_irq_available(int irq);
+void acpi_penalize_sci_irq(int irq, int trigger, int polarity);
 void acpi_pci_irq_disable (struct pci_dev *dev);
 
 extern int ec_read(u8 addr, u8 *val);
-- 
cgit v1.2.3


From 8ef4227615e158faa4ee85a1d6466782f7e22f2f Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 24 Oct 2016 15:27:59 +1000
Subject: x86/io: add interface to reserve io memtype for a resource range.
 (v1.1)

A recent change to the mm code in:
87744ab3832b mm: fix cache mode tracking in vm_insert_mixed()

started enforcing checking the memory type against the registered list for
amixed pfn insertion mappings. It happens that the drm drivers for a number
of gpus relied on this being broken. Currently the driver only inserted
VRAM mappings into the tracking table when they came from the kernel,
and userspace mappings never landed in the table. This led to a regression
where all the mapping end up as UC instead of WC now.

I've considered a number of solutions but since this needs to be fixed
in fixes and not next, and some of the solutions were going to introduce
overhead that hadn't been there before I didn't consider them viable at
this stage. These mainly concerned hooking into the TTM io reserve APIs,
but these API have a bunch of fast paths I didn't want to unwind to add
this to.

The solution I've decided on is to add a new API like the arch_phys_wc
APIs (these would have worked but wc_del didn't take a range), and
use them from the drivers to add a WC compatible mapping to the table
for all VRAM on those GPUs. This means we can then create userspace
mapping that won't get degraded to UC.

v1.1: use CONFIG_X86_PAT + add some comments in io.h

Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: x86@kernel.org
Cc: mcgrof@suse.com
Cc: Dan Williams <dan.j.williams@intel.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 include/linux/io.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io.h b/include/linux/io.h
index e2c8419278c1..82ef36eac8a1 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -141,4 +141,26 @@ enum {
 void *memremap(resource_size_t offset, size_t size, unsigned long flags);
 void memunmap(void *addr);
 
+/*
+ * On x86 PAT systems we have memory tracking that keeps track of
+ * the allowed mappings on memory ranges. This tracking works for
+ * all the in-kernel mapping APIs (ioremap*), but where the user
+ * wishes to map a range from a physical device into user memory
+ * the tracking won't be updated. This API is to be used by
+ * drivers which remap physical device pages into userspace,
+ * and wants to make sure they are mapped WC and not UC.
+ */
+#ifndef arch_io_reserve_memtype_wc
+static inline int arch_io_reserve_memtype_wc(resource_size_t base,
+					     resource_size_t size)
+{
+	return 0;
+}
+
+static inline void arch_io_free_memtype_wc(resource_size_t base,
+					   resource_size_t size)
+{
+}
+#endif
+
 #endif /* _LINUX_IO_H */
-- 
cgit v1.2.3


From 293de7dee4f9602676846dbeb84b1580123306b4 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <sthemmin@microsoft.com>
Date: Sun, 23 Oct 2016 09:28:29 -0700
Subject: doc: update docbook annotations for socket and skb

The skbuff and sock structure both had missing parameter annotation
values.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 601258f6e621..32810f279f8e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -936,6 +936,7 @@ struct sk_buff_fclones {
 
 /**
  *	skb_fclone_busy - check if fclone is busy
+ *	@sk: socket
  *	@skb: buffer
  *
  * Returns true if skb is a fast clone, and its clone is not freed.
-- 
cgit v1.2.3


From 9dcb8b685fc30813b35ab4b4bf39244430753190 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 26 Oct 2016 10:15:30 -0700
Subject: mm: remove per-zone hashtable of bitlock waitqueues

The per-zone waitqueues exist because of a scalability issue with the
page waitqueues on some NUMA machines, but it turns out that they hurt
normal loads, and now with the vmalloced stacks they also end up
breaking gfs2 that uses a bit_wait on a stack object:

     wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE)

where 'gh' can be a reference to the local variable 'mount_gh' on the
stack of fill_super().

The reason the per-zone hash table breaks for this case is that there is
no "zone" for virtual allocations, and trying to look up the physical
page to get at it will fail (with a BUG_ON()).

It turns out that I actually complained to the mm people about the
per-zone hash table for another reason just a month ago: the zone lookup
also hurts the regular use of "unlock_page()" a lot, because the zone
lookup ends up forcing several unnecessary cache misses and generates
horrible code.

As part of that earlier discussion, we had a much better solution for
the NUMA scalability issue - by just making the page lock have a
separate contention bit, the waitqueue doesn't even have to be looked at
for the normal case.

Peter Zijlstra already has a patch for that, but let's see if anybody
even notices.  In the meantime, let's fix the actual gfs2 breakage by
simplifying the bitlock waitqueues and removing the per-zone issue.

Reported-by: Andreas Gruenbacher <agruenba@redhat.com>
Tested-by: Bob Peterson <rpeterso@redhat.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 30 ++----------------------------
 1 file changed, 2 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7f2ae99e5daf..0f088f3a2fed 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -440,33 +440,7 @@ struct zone {
 	seqlock_t		span_seqlock;
 #endif
 
-	/*
-	 * wait_table		-- the array holding the hash table
-	 * wait_table_hash_nr_entries	-- the size of the hash table array
-	 * wait_table_bits	-- wait_table_size == (1 << wait_table_bits)
-	 *
-	 * The purpose of all these is to keep track of the people
-	 * waiting for a page to become available and make them
-	 * runnable again when possible. The trouble is that this
-	 * consumes a lot of space, especially when so few things
-	 * wait on pages at a given time. So instead of using
-	 * per-page waitqueues, we use a waitqueue hash table.
-	 *
-	 * The bucket discipline is to sleep on the same queue when
-	 * colliding and wake all in that wait queue when removing.
-	 * When something wakes, it must check to be sure its page is
-	 * truly available, a la thundering herd. The cost of a
-	 * collision is great, but given the expected load of the
-	 * table, they should be so rare as to be outweighed by the
-	 * benefits from the saved space.
-	 *
-	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
-	 * primary users of these fields, and in mm/page_alloc.c
-	 * free_area_init_core() performs the initialization of them.
-	 */
-	wait_queue_head_t	*wait_table;
-	unsigned long		wait_table_hash_nr_entries;
-	unsigned long		wait_table_bits;
+	int initialized;
 
 	/* Write-intensive fields used from the page allocator */
 	ZONE_PADDING(_pad1_)
@@ -546,7 +520,7 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
 
 static inline bool zone_is_initialized(struct zone *zone)
 {
-	return !!zone->wait_table;
+	return zone->initialized;
 }
 
 static inline bool zone_is_empty(struct zone *zone)
-- 
cgit v1.2.3


From c0a0aba8e478229b2f0956918542152fbad3f794 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 27 Oct 2016 17:46:38 -0700
Subject: kconfig.h: remove config_enabled() macro

The use of config_enabled() is ambiguous.  For config options,
IS_ENABLED(), IS_REACHABLE(), etc.  will make intention clearer.
Sometimes config_enabled() has been used for non-config options because
it is useful to check whether the given symbol is defined or not.

I have been tackling on deprecating config_enabled(), and now is the
time to finish this work.

Some new users have appeared for v4.9-rc1, but it is trivial to replace
them:

 - arch/x86/mm/kaslr.c
  replace config_enabled() with IS_ENABLED() because
  CONFIG_X86_ESPFIX64 and CONFIG_EFI are boolean.

 - include/asm-generic/export.h
  replace config_enabled() with __is_defined().

Then, config_enabled() can be removed now.

Going forward, please use IS_ENABLED(), IS_REACHABLE(), etc. for config
options, and __is_defined() for non-config symbols.

Link: http://lkml.kernel.org/r/1476616078-32252-1-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Michal Marek <mmarek@suse.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Garnier <thgarnie@google.com>
Cc: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kconfig.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h
index 15ec117ec537..8f2e059e4d45 100644
--- a/include/linux/kconfig.h
+++ b/include/linux/kconfig.h
@@ -31,7 +31,6 @@
  * When CONFIG_BOOGER is not defined, we generate a (... 1, 0) pair, and when
  * the last step cherry picks the 2nd arg, we get a zero.
  */
-#define config_enabled(cfg)		___is_defined(cfg)
 #define __is_defined(x)			___is_defined(x)
 #define ___is_defined(val)		____is_defined(__ARG_PLACEHOLDER_##val)
 #define ____is_defined(arg1_or_junk)	__take_second_arg(arg1_or_junk 1, 0)
@@ -41,13 +40,13 @@
  * otherwise. For boolean options, this is equivalent to
  * IS_ENABLED(CONFIG_FOO).
  */
-#define IS_BUILTIN(option) config_enabled(option)
+#define IS_BUILTIN(option) __is_defined(option)
 
 /*
  * IS_MODULE(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'm', 0
  * otherwise.
  */
-#define IS_MODULE(option) config_enabled(option##_MODULE)
+#define IS_MODULE(option) __is_defined(option##_MODULE)
 
 /*
  * IS_REACHABLE(CONFIG_FOO) evaluates to 1 if the currently compiled
-- 
cgit v1.2.3


From 73f907fd5fa56b0066d199bdd7126bbd04f6cd7b Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Mon, 24 Oct 2016 16:46:20 +0200
Subject: mtd: nand: Fix data interface configuration logic

When changing from one data interface setting to another, one has to
ensure a specific sequence which is described in the ONFI spec.

One of these constraints is that the CE line has go high after a reset
before a command can be sent with the new data interface setting, which
is not guaranteed by the current implementation.

Rework the nand_reset() function and all the call sites to make sure the
CE line is asserted and released when required.

Also make sure to actually apply the new data interface setting on the
first die.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Fixes: d8e725dd8311 ("mtd: nand: automate NAND timings selection")
Reviewed-by: Sascha Hauer <s.hauer@pengutronix.de>
Tested-by: Marc Gonzalez <marc_gonzalez@sigmadesigns.com>
---
 include/linux/mtd/nand.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index c5d3d5024fc8..d8905a229f34 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -1184,7 +1184,7 @@ int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
 			   int page);
 
 /* Reset and initialize a NAND device */
-int nand_reset(struct nand_chip *chip);
+int nand_reset(struct nand_chip *chip, int chipnr);
 
 /* Free resources held by the NAND device */
 void nand_cleanup(struct nand_chip *chip);
-- 
cgit v1.2.3


From 5aab90ce1ec449912a2ebc4d45e0c85dac29e9dd Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 26 Oct 2016 11:48:24 +0200
Subject: perf/powerpc: Don't call perf_event_disable() from atomic context

The trinity syscall fuzzer triggered following WARN() on powerpc:

  WARNING: CPU: 9 PID: 2998 at arch/powerpc/kernel/hw_breakpoint.c:278
  ...
  NIP [c00000000093aedc] .hw_breakpoint_handler+0x28c/0x2b0
  LR [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0
  Call Trace:
  [c0000002f7933580] [c00000000093aed8] .hw_breakpoint_handler+0x288/0x2b0 (unreliable)
  [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0
  [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0
  [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0
  [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100
  [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48

Followed by a lockdep warning:

  ===============================
  [ INFO: suspicious RCU usage. ]
  4.8.0-rc5+ #7 Tainted: G        W
  -------------------------------
  ./include/linux/rcupdate.h:556 Illegal context switch in RCU read-side critical section!

  other info that might help us debug this:

  rcu_scheduler_active = 1, debug_locks = 0
  2 locks held by ls/2998:
   #0:  (rcu_read_lock){......}, at: [<c0000000000f6a00>] .__atomic_notifier_call_chain+0x0/0x1c0
   #1:  (rcu_read_lock){......}, at: [<c00000000093ac50>] .hw_breakpoint_handler+0x0/0x2b0

  stack backtrace:
  CPU: 9 PID: 2998 Comm: ls Tainted: G        W       4.8.0-rc5+ #7
  Call Trace:
  [c0000002f7933150] [c00000000094b1f8] .dump_stack+0xe0/0x14c (unreliable)
  [c0000002f79331e0] [c00000000013c468] .lockdep_rcu_suspicious+0x138/0x180
  [c0000002f7933270] [c0000000001005d8] .___might_sleep+0x278/0x2e0
  [c0000002f7933300] [c000000000935584] .mutex_lock_nested+0x64/0x5a0
  [c0000002f7933410] [c00000000023084c] .perf_event_ctx_lock_nested+0x16c/0x380
  [c0000002f7933500] [c000000000230a80] .perf_event_disable+0x20/0x60
  [c0000002f7933580] [c00000000093aeec] .hw_breakpoint_handler+0x29c/0x2b0
  [c0000002f7933630] [c0000000000f671c] .notifier_call_chain+0x7c/0xf0
  [c0000002f79336d0] [c0000000000f6abc] .__atomic_notifier_call_chain+0xbc/0x1c0
  [c0000002f7933780] [c0000000000f6c40] .notify_die+0x70/0xd0
  [c0000002f7933820] [c00000000001a74c] .do_break+0x4c/0x100
  [c0000002f7933920] [c0000000000089fc] handle_dabr_fault+0x14/0x48

While it looks like the first WARN() is probably valid, the other one is
triggered by disabling event via perf_event_disable() from atomic context.

The event is disabled here in case we were not able to emulate
the instruction that hit the breakpoint. By disabling the event
we unschedule the event and make sure it's not scheduled back.

But we can't call perf_event_disable() from atomic context, instead
we need to use the event's pending_disable irq_work method to disable it.

Reported-by: Jan Stancek <jstancek@redhat.com>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20161026094824.GA21397@krava
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 060d0ede88df..4741ecdb9817 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1257,6 +1257,7 @@ extern u64 perf_swevent_set_period(struct perf_event *event);
 extern void perf_event_enable(struct perf_event *event);
 extern void perf_event_disable(struct perf_event *event);
 extern void perf_event_disable_local(struct perf_event *event);
+extern void perf_event_disable_inatomic(struct perf_event *event);
 extern void perf_event_task_tick(void);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
-- 
cgit v1.2.3


From 72193a953ada9058e46970838cc42cbd18bf4eba Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Date: Fri, 28 Oct 2016 11:38:53 +0100
Subject: regmap: Rename ret variable in regmap_read_poll_timeout

As almost all of the callers of the regmap_read_poll_timeout macro
will include a local ret variable we will always get a Sparse warning
about the duplication of the ret variable:

warning: symbol 'ret' shadows an earlier one

Simply rename the ret variable in the marco to pollret to make this
significantly less likely to happen.

Signed-off-by: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regmap.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 9adc7b21903d..fc9e3c576f9c 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -116,22 +116,22 @@ struct reg_sequence {
 #define regmap_read_poll_timeout(map, addr, val, cond, sleep_us, timeout_us) \
 ({ \
 	ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \
-	int ret; \
+	int pollret; \
 	might_sleep_if(sleep_us); \
 	for (;;) { \
-		ret = regmap_read((map), (addr), &(val)); \
-		if (ret) \
+		pollret = regmap_read((map), (addr), &(val)); \
+		if (pollret) \
 			break; \
 		if (cond) \
 			break; \
 		if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \
-			ret = regmap_read((map), (addr), &(val)); \
+			pollret = regmap_read((map), (addr), &(val)); \
 			break; \
 		} \
 		if (sleep_us) \
 			usleep_range((sleep_us >> 2) + 1, sleep_us); \
 	} \
-	ret ?: ((cond) ? 0 : -ETIMEDOUT); \
+	pollret ?: ((cond) ? 0 : -ETIMEDOUT); \
 })
 
 #ifdef CONFIG_REGMAP
-- 
cgit v1.2.3


From b47bd6ea40636362a8b6605de51207cc387ba0b8 Mon Sep 17 00:00:00 2001
From: Daniel Jurgens <danielj@mellanox.com>
Date: Tue, 25 Oct 2016 18:36:24 +0300
Subject: {net, ib}/mlx5: Make cache line size determination at runtime.

ARM 64B cache line systems have L1_CACHE_BYTES set to 128.
cache_line_size() will return the correct size.

Fixes: cf50b5efa2fe('net/mlx5_core/ib: New device capabilities
handling.')
Signed-off-by: Daniel Jurgens <danielj@mellanox.com>

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/driver.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 85c4786427e4..5dbda60a09f4 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -625,10 +625,6 @@ struct mlx5_db {
 	int			index;
 };
 
-enum {
-	MLX5_DB_PER_PAGE = PAGE_SIZE / L1_CACHE_BYTES,
-};
-
 enum {
 	MLX5_COMP_EQ_SIZE = 1024,
 };
@@ -638,13 +634,6 @@ enum {
 	MLX5_PTYS_EN = 1 << 2,
 };
 
-struct mlx5_db_pgdir {
-	struct list_head	list;
-	DECLARE_BITMAP(bitmap, MLX5_DB_PER_PAGE);
-	__be32		       *db_page;
-	dma_addr_t		db_dma;
-};
-
 typedef void (*mlx5_cmd_cbk_t)(int status, void *context);
 
 struct mlx5_cmd_work_ent {
-- 
cgit v1.2.3


From 05ac2c0b7438ea08c5d54b48797acf9b22cb2f6f Mon Sep 17 00:00:00 2001
From: Mohamad Haj Yahia <mohamad@mellanox.com>
Date: Tue, 25 Oct 2016 18:36:33 +0300
Subject: net/mlx5: Fix race between PCI error handlers and health work

Currently there is a race between the health care work and the kernel
pci error handlers because both of them detect the error, the first one
to be called will do the error handling.
There is a chance that health care will disable the pci after resuming
pci slot.
Also create a separate WQ because now we will have two types of health
works, one for the error detection and one for the recovery.

Fixes: 89d44f0a6c73 ('net/mlx5_core: Add pci error handlers to mlx5_core driver')
Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/driver.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 5dbda60a09f4..7d9a5d08eb59 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -418,7 +418,10 @@ struct mlx5_core_health {
 	u32				prev;
 	int				miss_counter;
 	bool				sick;
+	/* wq spinlock to synchronize draining */
+	spinlock_t			wq_lock;
 	struct workqueue_struct	       *wq;
+	unsigned long			flags;
 	struct work_struct		work;
 };
 
@@ -778,6 +781,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
 void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
+void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
 int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size,
 			struct mlx5_buf *buf, int node);
 int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf);
-- 
cgit v1.2.3


From 04c0c1ab38e95105d950db5b84e727637e149ce7 Mon Sep 17 00:00:00 2001
From: Mohamad Haj Yahia <mohamad@mellanox.com>
Date: Tue, 25 Oct 2016 18:36:34 +0300
Subject: net/mlx5: PCI error recovery health care simulation

In case that the kernel PCI error handlers are not called, we will
trigger our own recovery flow.

The health work will give priority to the kernel pci error handlers to
recover the PCI by waiting for a small period, if the pci error handlers
are not triggered the manual recovery flow will be executed.

We don't save pci state in case of manual recovery because it will ruin the
pci configuration space and we will lose dma sync.

Fixes: 89d44f0a6c73 ('net/mlx5_core: Add pci error handlers to mlx5_core driver')
Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/driver.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7d9a5d08eb59..ecc451d89ccd 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -423,6 +423,7 @@ struct mlx5_core_health {
 	struct workqueue_struct	       *wq;
 	unsigned long			flags;
 	struct work_struct		work;
+	struct delayed_work		recover_work;
 };
 
 struct mlx5_cq_table {
-- 
cgit v1.2.3


From e934f684856393a0b2aecc7fdd8357a48b79c535 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <sthemmin@microsoft.com>
Date: Wed, 26 Oct 2016 08:30:29 -0700
Subject: Revert "hv_netvsc: report vmbus name in ethtool"

This reverts commit e3f74b841d48
("hv_netvsc: report vmbus name in ethtool")'
because of problem introduced by commit f9a56e5d6a0ba
("Drivers: hv: make VMBus bus ids persistent").
This changed the format of the vmbus name and this new format is too
long to fit in the bus_info field of ethtool.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/hyperv.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 6824556d37ed..cd184bdca58f 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1169,13 +1169,6 @@ int __must_check __vmbus_driver_register(struct hv_driver *hv_driver,
 					 const char *mod_name);
 void vmbus_driver_unregister(struct hv_driver *hv_driver);
 
-static inline const char *vmbus_dev_name(const struct hv_device *device_obj)
-{
-	const struct kobject *kobj = &device_obj->device.kobj;
-
-	return kobj->name;
-}
-
 void vmbus_hvsock_device_unregister(struct vmbus_channel *channel);
 
 int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj,
-- 
cgit v1.2.3


From 6f2e0d2c3bf0f8d322ab7516c57340c7189cca02 Mon Sep 17 00:00:00 2001
From: Eugenia Emantayev <eugenia@mellanox.com>
Date: Thu, 27 Oct 2016 16:27:20 +0300
Subject: net/mlx4: Fix firmware command timeout during interrupt test

Currently interrupt test that is part of ethtool selftest runs the
check over all interrupt vectors of the device.
In mlx4_en package part of interrupt vectors are uninitialized since
mlx4_ib doesn't exist. This causes NOP FW command to time out.
Change logic to test current port interrupt vectors only.

Signed-off-by: Eugenia Emantayev <eugenia@mellanox.com>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx4/device.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index f6a164297358..3be7abd6e722 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1399,7 +1399,8 @@ void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr,
 		    u32 *lkey, u32 *rkey);
 int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr);
 int mlx4_SYNC_TPT(struct mlx4_dev *dev);
-int mlx4_test_interrupts(struct mlx4_dev *dev);
+int mlx4_test_interrupt(struct mlx4_dev *dev, int vector);
+int mlx4_test_async(struct mlx4_dev *dev);
 int mlx4_query_diag_counters(struct mlx4_dev *dev, u8 op_modifier,
 			     const u32 offset[], u32 value[],
 			     size_t array_len, u8 port);
-- 
cgit v1.2.3