From ae7871be189cb41184f1e05742b4a99e2c59774d Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Fri, 16 Dec 2016 14:28:41 +0100
Subject: swiotlb: Convert swiotlb_force from int to enum

Convert the flag swiotlb_force from an int to an enum, to prepare for
the advent of more possible values.

Suggested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 include/linux/swiotlb.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 183f37c8a5e1..71d104e4c849 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -9,7 +9,12 @@ struct device;
 struct page;
 struct scatterlist;
 
-extern int swiotlb_force;
+enum swiotlb_force {
+	SWIOTLB_NORMAL,		/* Default - depending on HW DMA mask etc. */
+	SWIOTLB_FORCE,		/* swiotlb=force */
+};
+
+extern enum swiotlb_force swiotlb_force;
 
 /*
  * Maximum allowable number of contiguous slabs to map,
-- 
cgit v1.2.3


From fff5d99225107f5f13fe4a9805adc2a1c4b5fb00 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Fri, 16 Dec 2016 14:28:42 +0100
Subject: swiotlb: Add swiotlb=noforce debug option

On architectures like arm64, swiotlb is tied intimately to the core
architecture DMA support. In addition, ZONE_DMA cannot be disabled.

To aid debugging and catch devices not supporting DMA to memory outside
the 32-bit address space, add a kernel command line option
"swiotlb=noforce", which disables the use of bounce buffers.
If specified, trying to map memory that cannot be used with DMA will
fail, and a rate-limited warning will be printed.

Note that io_tlb_nslabs is set to 1, which is the minimal supported
value.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 include/linux/swiotlb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 71d104e4c849..d9c84a9cde3d 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -12,6 +12,7 @@ struct scatterlist;
 enum swiotlb_force {
 	SWIOTLB_NORMAL,		/* Default - depending on HW DMA mask etc. */
 	SWIOTLB_FORCE,		/* swiotlb=force */
+	SWIOTLB_NO_FORCE,	/* swiotlb=noforce */
 };
 
 extern enum swiotlb_force swiotlb_force;
-- 
cgit v1.2.3


From 72c5296f9d64d8f5f27c2133e5f108a45a353d71 Mon Sep 17 00:00:00 2001
From: Jon Derrick <jonathan.derrick@intel.com>
Date: Tue, 20 Dec 2016 14:38:14 -0700
Subject: genhd: remove dead and duplicated scsi code

blk_scsi_cmd_filter use was deprecated by 4beab5c6 and the SCSI macros
are duplicated in blkdev.h, both likely reintroduced by a bad merge from
540eed56.

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/genhd.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index e0341af6950e..76f39754e7b0 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -146,15 +146,6 @@ enum {
 	DISK_EVENT_EJECT_REQUEST		= 1 << 1, /* eject requested */
 };
 
-#define BLK_SCSI_MAX_CMDS	(256)
-#define BLK_SCSI_CMD_PER_LONG	(BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
-
-struct blk_scsi_cmd_filter {
-	unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
-	unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
-	struct kobject kobj;
-};
-
 struct disk_part_tbl {
 	struct rcu_head rcu_head;
 	int len;
-- 
cgit v1.2.3


From e3ba730702af370563f66cb610b71aa0ca67955e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 22 Dec 2016 10:15:20 +0100
Subject: fsnotify: Remove fsnotify_duplicate_mark()

There are only two calls sites of fsnotify_duplicate_mark(). Those are
in kernel/audit_tree.c and both are bogus. Vfsmount pointer is unused
for audit tree, inode pointer and group gets set in
fsnotify_add_mark_locked() later anyway, mask and free_mark are already
set in alloc_chunk(). In fact, calling fsnotify_duplicate_mark() is
actively harmful because following fsnotify_add_mark_locked() will leak
group reference by overwriting the group pointer. So just remove the two
calls to fsnotify_duplicate_mark() and the function.

Signed-off-by: Jan Kara <jack@suse.cz>
[PM: line wrapping to fit in 80 chars]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/fsnotify_backend.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 7268ed076be8..ce77caa2bb10 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -324,8 +324,6 @@ extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(str
 extern struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct inode *inode);
 /* find (and take a reference) to a mark associated with group and vfsmount */
 extern struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt);
-/* copy the values from old into new */
-extern void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old);
 /* set the ignored_mask of a mark */
 extern void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask);
 /* set the mask of a mark (might pin the object into memory */
-- 
cgit v1.2.3


From 1efbd205b3cc5882a8c386c58a57134044e9d5ba Mon Sep 17 00:00:00 2001
From: Gal Pressman <galp@mellanox.com>
Date: Wed, 28 Dec 2016 14:58:39 +0200
Subject: Revert "net/mlx5: Add MPCNT register infrastructure"

This reverts commit 7f503169cabd70c1f13b9279c50eca7dfb9a7d51.

Fixes: 7f503169cabd ("net/mlx5: Add MPCNT register infrastructure")
Signed-off-by: Gal Pressman <galp@mellanox.com>
Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx5/device.h   |  5 ---
 include/linux/mlx5/driver.h   |  1 -
 include/linux/mlx5/mlx5_ifc.h | 93 -------------------------------------------
 3 files changed, 99 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 9f489365b3d3..52b437431c6a 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1071,11 +1071,6 @@ enum {
 	MLX5_INFINIBAND_PORT_COUNTERS_GROUP   = 0x20,
 };
 
-enum {
-	MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP       = 0x0,
-	MLX5_PCIE_TIMERS_AND_STATES_COUNTERS_GROUP = 0x2,
-};
-
 static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz)
 {
 	if (pkey_sz > MLX5_MAX_LOG_PKEY_TABLE)
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 0ae55361e674..735b36335f29 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -123,7 +123,6 @@ enum {
 	MLX5_REG_HOST_ENDIANNESS = 0x7004,
 	MLX5_REG_MCIA		 = 0x9014,
 	MLX5_REG_MLCR		 = 0x902b,
-	MLX5_REG_MPCNT		 = 0x9051,
 };
 
 enum mlx5_dcbx_oper_mode {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 57bec544e20a..a852e9db6f0d 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1757,80 +1757,6 @@ struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits {
 	u8         reserved_at_4c0[0x300];
 };
 
-struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits {
-	u8         life_time_counter_high[0x20];
-
-	u8         life_time_counter_low[0x20];
-
-	u8         rx_errors[0x20];
-
-	u8         tx_errors[0x20];
-
-	u8         l0_to_recovery_eieos[0x20];
-
-	u8         l0_to_recovery_ts[0x20];
-
-	u8         l0_to_recovery_framing[0x20];
-
-	u8         l0_to_recovery_retrain[0x20];
-
-	u8         crc_error_dllp[0x20];
-
-	u8         crc_error_tlp[0x20];
-
-	u8         reserved_at_140[0x680];
-};
-
-struct mlx5_ifc_pcie_tas_cntrs_grp_data_layout_bits {
-	u8         life_time_counter_high[0x20];
-
-	u8         life_time_counter_low[0x20];
-
-	u8         time_to_boot_image_start[0x20];
-
-	u8         time_to_link_image[0x20];
-
-	u8         calibration_time[0x20];
-
-	u8         time_to_first_perst[0x20];
-
-	u8         time_to_detect_state[0x20];
-
-	u8         time_to_l0[0x20];
-
-	u8         time_to_crs_en[0x20];
-
-	u8         time_to_plastic_image_start[0x20];
-
-	u8         time_to_iron_image_start[0x20];
-
-	u8         perst_handler[0x20];
-
-	u8         times_in_l1[0x20];
-
-	u8         times_in_l23[0x20];
-
-	u8         dl_down[0x20];
-
-	u8         config_cycle1usec[0x20];
-
-	u8         config_cycle2to7usec[0x20];
-
-	u8         config_cycle_8to15usec[0x20];
-
-	u8         config_cycle_16_to_63usec[0x20];
-
-	u8         config_cycle_64usec[0x20];
-
-	u8         correctable_err_msg_sent[0x20];
-
-	u8         non_fatal_err_msg_sent[0x20];
-
-	u8         fatal_err_msg_sent[0x20];
-
-	u8         reserved_at_2e0[0x4e0];
-};
-
 struct mlx5_ifc_cmd_inter_comp_event_bits {
 	u8         command_completion_vector[0x20];
 
@@ -2995,12 +2921,6 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits {
 	u8         reserved_at_0[0x7c0];
 };
 
-union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits {
-	struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits pcie_perf_cntrs_grp_data_layout;
-	struct mlx5_ifc_pcie_tas_cntrs_grp_data_layout_bits pcie_tas_cntrs_grp_data_layout;
-	u8         reserved_at_0[0x7c0];
-};
-
 union mlx5_ifc_event_auto_bits {
 	struct mlx5_ifc_comp_event_bits comp_event;
 	struct mlx5_ifc_dct_events_bits dct_events;
@@ -7320,18 +7240,6 @@ struct mlx5_ifc_ppcnt_reg_bits {
 	union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set;
 };
 
-struct mlx5_ifc_mpcnt_reg_bits {
-	u8         reserved_at_0[0x8];
-	u8         pcie_index[0x8];
-	u8         reserved_at_10[0xa];
-	u8         grp[0x6];
-
-	u8         clr[0x1];
-	u8         reserved_at_21[0x1f];
-
-	union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits counter_set;
-};
-
 struct mlx5_ifc_ppad_reg_bits {
 	u8         reserved_at_0[0x3];
 	u8         single_mac[0x1];
@@ -7937,7 +7845,6 @@ union mlx5_ifc_ports_control_registers_document_bits {
 	struct mlx5_ifc_pmtu_reg_bits pmtu_reg;
 	struct mlx5_ifc_ppad_reg_bits ppad_reg;
 	struct mlx5_ifc_ppcnt_reg_bits ppcnt_reg;
-	struct mlx5_ifc_mpcnt_reg_bits mpcnt_reg;
 	struct mlx5_ifc_pplm_reg_bits pplm_reg;
 	struct mlx5_ifc_pplr_reg_bits pplr_reg;
 	struct mlx5_ifc_ppsc_reg_bits ppsc_reg;
-- 
cgit v1.2.3


From 10b1c04e92229ebeb38ccd0dcf2b6d3ec73c0575 Mon Sep 17 00:00:00 2001
From: Jack Morgenstein <jackm@dev.mellanox.co.il>
Date: Thu, 29 Dec 2016 18:37:13 +0200
Subject: net/mlx4_core: Fix raw qp flow steering rules under SRIOV

Demoting simple flow steering rule priority (for DPDK) was achieved by
wrapping FW commands MLX4_QP_FLOW_STEERING_ATTACH/DETACH for the PF
as well, and forcing the priority to MLX4_DOMAIN_NIC in the wrapper
function for the PF and all VFs.

In function mlx4_ib_create_flow(), this change caused the main rule
creation for the PF to be wrapped, while it left the associated
tunnel steering rule creation unwrapped for the PF.

This mismatch caused rule deletion failures in mlx4_ib_destroy_flow()
for the PF when the detach wrapper function did not find the associated
tunnel-steering rule (since creation of that rule for the PF did not
go through the wrapper function).

Fix this by setting MLX4_QP_FLOW_STEERING_ATTACH/DETACH to be "native"
(so that the PF invocation does not go through the wrapper), and perform
the required priority demotion for the PF in the mlx4_ib_create_flow()
code path.

Fixes: 48564135cba8 ("net/mlx4_core: Demote simple multicast and broadcast flow steering rules")
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mlx4/device.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 93bdb3485192..6533c16e27ad 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1384,6 +1384,8 @@ int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val);
 int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv);
 int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port,
 				      bool *vlan_offload_disabled);
+void mlx4_handle_eth_header_mcast_prio(struct mlx4_net_trans_rule_hw_ctrl *ctrl,
+				       struct _rule_hw *eth_header);
 int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx);
 int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx);
 int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index);
-- 
cgit v1.2.3


From a0c10687ec9506b5e14fe3dd47832a77f2f2500c Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Fri, 30 Dec 2016 03:21:38 -0800
Subject: Revert "remoteproc: Merge table_ptr and cached_table pointers"

Following any fw_rsc_vdev entries in the resource table are two variable
length arrays, the first one reference vring resources and the second
one is the virtio config space.  The virtio config space is used by
virtio to communicate status and configuration changes and must as such
be shared with the remote.

The reverted commit incorrectly made any changes to the virtio config
space only affect the local copy, in an attempt to allowing memory
protection of the shared resource table.

This reverts commit cda8529346935fc86f476999ac4fbfe4e17abf11.
Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
---
 include/linux/remoteproc.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index e2f3a3281d8f..8265d351c9f0 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -408,7 +408,8 @@ enum rproc_crash_type {
  * @crash_comp: completion used to sync crash handler and the rproc reload
  * @recovery_disabled: flag that state if recovery was disabled
  * @max_notifyid: largest allocated notify id.
- * @table_ptr: our copy of the resource table
+ * @table_ptr: pointer to the resource table in effect
+ * @cached_table: copy of the resource table
  * @has_iommu: flag to indicate if remote processor is behind an MMU
  */
 struct rproc {
@@ -440,6 +441,7 @@ struct rproc {
 	bool recovery_disabled;
 	int max_notifyid;
 	struct resource_table *table_ptr;
+	struct resource_table *cached_table;
 	bool has_iommu;
 	bool auto_boot;
 };
-- 
cgit v1.2.3


From 42930553a7c11f06351bc08b889808d0f6020f08 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Fri, 30 Dec 2016 08:13:38 -0700
Subject: vfio-mdev: de-polute the namespace, rename parent_device & parent_ops

Add an mdev_ prefix so we're not poluting the namespace so much.

Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Jike Song <jike.song@intel.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed by: Kirti Wankhede <kwankhede@nvidia.com>
---
 include/linux/mdev.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index ec819e9a115a..853bb78e5866 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -14,9 +14,9 @@
 #define MDEV_H
 
 /* Parent device */
-struct parent_device {
-	struct device		*dev;
-	const struct parent_ops	*ops;
+struct mdev_parent {
+	struct device			*dev;
+	const struct mdev_parent_ops	*ops;
 
 	/* internal */
 	struct kref		ref;
@@ -29,7 +29,7 @@ struct parent_device {
 /* Mediated device */
 struct mdev_device {
 	struct device		dev;
-	struct parent_device	*parent;
+	struct mdev_parent	*parent;
 	uuid_le			uuid;
 	void			*driver_data;
 
@@ -40,7 +40,7 @@ struct mdev_device {
 };
 
 /**
- * struct parent_ops - Structure to be registered for each parent device to
+ * struct mdev_parent_ops - Structure to be registered for each parent device to
  * register the device to mdev module.
  *
  * @owner:		The module owner.
@@ -86,10 +86,10 @@ struct mdev_device {
  *			@mdev: mediated device structure
  *			@vma: vma structure
  * Parent device that support mediated device should be registered with mdev
- * module with parent_ops structure.
+ * module with mdev_parent_ops structure.
  **/
 
-struct parent_ops {
+struct mdev_parent_ops {
 	struct module   *owner;
 	const struct attribute_group **dev_attr_groups;
 	const struct attribute_group **mdev_attr_groups;
@@ -159,7 +159,7 @@ extern struct bus_type mdev_bus_type;
 #define dev_is_mdev(d) ((d)->bus == &mdev_bus_type)
 
 extern int  mdev_register_device(struct device *dev,
-				 const struct parent_ops *ops);
+				 const struct mdev_parent_ops *ops);
 extern void mdev_unregister_device(struct device *dev);
 
 extern int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
-- 
cgit v1.2.3


From 9372e6feaafb65d88f667ffb5b7b425f8568344f Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Fri, 30 Dec 2016 08:13:41 -0700
Subject: vfio-mdev: Make mdev_parent private

Rather than hoping for good behavior by marking some elements
internal, enforce it by making the entire structure private and
creating an accessor function for the one useful external field.

Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Zhi Wang <zhi.a.wang@intel.com>
Cc: Jike Song <jike.song@intel.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed by: Kirti Wankhede <kwankhede@nvidia.com>
---
 include/linux/mdev.h | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 853bb78e5866..f586222b6c25 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -13,19 +13,6 @@
 #ifndef MDEV_H
 #define MDEV_H
 
-/* Parent device */
-struct mdev_parent {
-	struct device			*dev;
-	const struct mdev_parent_ops	*ops;
-
-	/* internal */
-	struct kref		ref;
-	struct mutex		lock;
-	struct list_head	next;
-	struct kset		*mdev_types_kset;
-	struct list_head	type_list;
-};
-
 /* Mediated device */
 struct mdev_device {
 	struct device		dev;
@@ -165,4 +152,6 @@ extern void mdev_unregister_device(struct device *dev);
 extern int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
 extern void mdev_unregister_driver(struct mdev_driver *drv);
 
+extern struct device *mdev_parent_dev(struct mdev_device *mdev);
+
 #endif /* MDEV_H */
-- 
cgit v1.2.3


From 99e3123e3d72616a829dad6d25aa005ef1ef9b13 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Fri, 30 Dec 2016 08:13:44 -0700
Subject: vfio-mdev: Make mdev_device private and abstract interfaces

Abstract access to mdev_device so that we can define which interfaces
are public rather than relying on comments in the structure.

Cc: Zhenyu Wang <zhenyuw@linux.intel.com>
Cc: Zhi Wang <zhi.a.wang@intel.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reviewed-by: Jike Song <jike.song@intel.com>
Reviewed by: Kirti Wankhede <kwankhede@nvidia.com>
---
 include/linux/mdev.h | 31 ++++++-------------------------
 1 file changed, 6 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index f586222b6c25..3ee44b8d2bb3 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -13,18 +13,7 @@
 #ifndef MDEV_H
 #define MDEV_H
 
-/* Mediated device */
-struct mdev_device {
-	struct device		dev;
-	struct mdev_parent	*parent;
-	uuid_le			uuid;
-	void			*driver_data;
-
-	/* internal */
-	struct kref		ref;
-	struct list_head	next;
-	struct kobject		*type_kobj;
-};
+struct mdev_device;
 
 /**
  * struct mdev_parent_ops - Structure to be registered for each parent device to
@@ -75,7 +64,6 @@ struct mdev_device {
  * Parent device that support mediated device should be registered with mdev
  * module with mdev_parent_ops structure.
  **/
-
 struct mdev_parent_ops {
 	struct module   *owner;
 	const struct attribute_group **dev_attr_groups;
@@ -129,22 +117,13 @@ struct mdev_driver {
 };
 
 #define to_mdev_driver(drv)	container_of(drv, struct mdev_driver, driver)
-#define to_mdev_device(dev)	container_of(dev, struct mdev_device, dev)
 
-static inline void *mdev_get_drvdata(struct mdev_device *mdev)
-{
-	return mdev->driver_data;
-}
-
-static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data)
-{
-	mdev->driver_data = data;
-}
+extern void *mdev_get_drvdata(struct mdev_device *mdev);
+extern void mdev_set_drvdata(struct mdev_device *mdev, void *data);
+extern uuid_le mdev_uuid(struct mdev_device *mdev);
 
 extern struct bus_type mdev_bus_type;
 
-#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type)
-
 extern int  mdev_register_device(struct device *dev,
 				 const struct mdev_parent_ops *ops);
 extern void mdev_unregister_device(struct device *dev);
@@ -153,5 +132,7 @@ extern int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
 extern void mdev_unregister_driver(struct mdev_driver *drv);
 
 extern struct device *mdev_parent_dev(struct mdev_device *mdev);
+extern struct device *mdev_dev(struct mdev_device *mdev);
+extern struct mdev_device *mdev_from_dev(struct device *dev);
 
 #endif /* MDEV_H */
-- 
cgit v1.2.3


From 65e4345c8ef8811bbb4860fe5f2df10646b7f2e1 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 30 Dec 2016 23:54:18 +0100
Subject: iio: accel: st_accel: fix LIS3LV02 reading and scaling

The LIS3LV02 has a special bit that need to be set to get the
read values left aligned. Before this patch we get gibberish
like this:

iio_generic_buffer -a -c10 -n lis3lv02dl_accel
(...)
0.000000 -0.010042 -0.642688 19155832931907
0.000000 -0.010042 -0.642688 19155858751073

Which is because we read a raw value for 1g as 64 which is
the nominal 1024 for 1g shifted 4 bits to the left by being
right-aligned rather than left aligned.

Since all other sensors are left aligned, add some code to
set the special DAS (data alignment setting) bit to 1 so that
the right value is now read like this:

iio_generic_buffer -a -c10 -n lis3lv02dl_accel
(...)
0.000000 -0.147095 -10.120135 24761614364956
-0.029419 -0.176514 -10.120135 24761631624540

The scaling was weird as well: we have a gain of 1000 for 1g
and 3000 for 6g. I don't even remember how I came up with the
old values but they are wrong.

Fixes: 3acddf74f807 ("iio: st-sensors: add support for lis3lv02d accelerometer")
Cc: Lorenzo Bianconi <lorenzo.bianconi@st.com>
Cc: Giuseppe Barba <giuseppe.barba@st.com>
Cc: Denis Ciocca <denis.ciocca@st.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 include/linux/iio/common/st_sensors.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h
index 228bd44efa4c..497f2b3a5a62 100644
--- a/include/linux/iio/common/st_sensors.h
+++ b/include/linux/iio/common/st_sensors.h
@@ -115,6 +115,16 @@ struct st_sensor_bdu {
 	u8 mask;
 };
 
+/**
+ * struct st_sensor_das - ST sensor device data alignment selection
+ * @addr: address of the register.
+ * @mask: mask to write the das flag for left alignment.
+ */
+struct st_sensor_das {
+	u8 addr;
+	u8 mask;
+};
+
 /**
  * struct st_sensor_data_ready_irq - ST sensor device data-ready interrupt
  * @addr: address of the register.
@@ -185,6 +195,7 @@ struct st_sensor_transfer_function {
  * @enable_axis: Enable one or more axis of the sensor.
  * @fs: Full scale register and full scale list available.
  * @bdu: Block data update register.
+ * @das: Data Alignment Selection register.
  * @drdy_irq: Data ready register of the sensor.
  * @multi_read_bit: Use or not particular bit for [I2C/SPI] multi-read.
  * @bootime: samples to discard when sensor passing from power-down to power-up.
@@ -200,6 +211,7 @@ struct st_sensor_settings {
 	struct st_sensor_axis enable_axis;
 	struct st_sensor_fullscale fs;
 	struct st_sensor_bdu bdu;
+	struct st_sensor_das das;
 	struct st_sensor_data_ready_irq drdy_irq;
 	bool multi_read_bit;
 	unsigned int bootime;
-- 
cgit v1.2.3


From c6ef7fd40eddad38a8825cbd6bb2ce8bdbba88f5 Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Wed, 4 Jan 2017 15:08:15 -0500
Subject: vfio-mdev: fix non-standard ioctl return val causing i386 build fail
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

What appears to be a copy and paste error from the line above gets
the ioctl a ssize_t return value instead of the traditional "int".

The associated sample code used "long" which meant it would compile
for x86-64 but not i386, with the latter failing as follows:

  CC [M]  samples/vfio-mdev/mtty.o
samples/vfio-mdev/mtty.c:1418:20: error: initialization from incompatible pointer type [-Werror=incompatible-pointer-types]
  .ioctl          = mtty_ioctl,
                    ^
samples/vfio-mdev/mtty.c:1418:20: note: (near initialization for ‘mdev_fops.ioctl’)
cc1: some warnings being treated as errors

Since in this case, vfio is working with struct file_operations; as such:

    long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
    long (*compat_ioctl) (struct file *, unsigned int, unsigned long);

...and so here we just standardize on long vs. the normal int that user
space typically sees and documents as per "man ioctl" and similar.

Fixes: 9d1a546c53b4 ("docs: Sample driver to demonstrate how to use Mediated device framework.")
Cc: Kirti Wankhede <kwankhede@nvidia.com>
Cc: Neo Jia <cjia@nvidia.com>
Cc: kvm@vger.kernel.org
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/mdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mdev.h b/include/linux/mdev.h
index 3ee44b8d2bb3..b6e048e1045f 100644
--- a/include/linux/mdev.h
+++ b/include/linux/mdev.h
@@ -78,7 +78,7 @@ struct mdev_parent_ops {
 			size_t count, loff_t *ppos);
 	ssize_t (*write)(struct mdev_device *mdev, const char __user *buf,
 			 size_t count, loff_t *ppos);
-	ssize_t (*ioctl)(struct mdev_device *mdev, unsigned int cmd,
+	long	(*ioctl)(struct mdev_device *mdev, unsigned int cmd,
 			 unsigned long arg);
 	int	(*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma);
 };
-- 
cgit v1.2.3


From 7453c549f5f6485c0d79cad7844870dcc7d1b34d Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Tue, 20 Dec 2016 10:02:02 -0500
Subject: swiotlb: Export swiotlb_max_segment to users

So they can figure out what is the optimal number of pages
that can be contingously stitched together without fear of
bounce buffer.

We also expose an mechanism for sub-users of SWIOTLB API, such
as Xen-SWIOTLB to set the max segment value. And lastly
if swiotlb=force is set (which mandates we bounce buffer everything)
we set max_segment so at least we can bounce buffer one 4K page
instead of a giant 512KB one for which we may not have space.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reported-and-Tested-by: Juergen Gross <jgross@suse.com>
---
 include/linux/swiotlb.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index d9c84a9cde3d..4ee479f2f355 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -114,11 +114,14 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask);
 
 #ifdef CONFIG_SWIOTLB
 extern void __init swiotlb_free(void);
+unsigned int swiotlb_max_segment(void);
 #else
 static inline void swiotlb_free(void) { }
+static inline unsigned int swiotlb_max_segment(void) { return 0; }
 #endif
 
 extern void swiotlb_print_info(void);
 extern int is_swiotlb_buffer(phys_addr_t paddr);
+extern void swiotlb_set_max_segment(unsigned int);
 
 #endif /* __LINUX_SWIOTLB_H */
-- 
cgit v1.2.3


From 20b1e22d01a4b0b11d3a1066e9feb04be38607ec Mon Sep 17 00:00:00 2001
From: Nicolai Stange <nicstange@gmail.com>
Date: Thu, 5 Jan 2017 13:51:29 +0100
Subject: x86/efi: Don't allocate memmap through memblock after mm_init()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the following commit:

  4bc9f92e64c8 ("x86/efi-bgrt: Use efi_mem_reserve() to avoid copying image data")

...  efi_bgrt_init() calls into the memblock allocator through
efi_mem_reserve() => efi_arch_mem_reserve() *after* mm_init() has been called.

Indeed, KASAN reports a bad read access later on in efi_free_boot_services():

  BUG: KASAN: use-after-free in efi_free_boot_services+0xae/0x24c
            at addr ffff88022de12740
  Read of size 4 by task swapper/0/0
  page:ffffea0008b78480 count:0 mapcount:-127
  mapping:          (null) index:0x1 flags: 0x5fff8000000000()
  [...]
  Call Trace:
   dump_stack+0x68/0x9f
   kasan_report_error+0x4c8/0x500
   kasan_report+0x58/0x60
   __asan_load4+0x61/0x80
   efi_free_boot_services+0xae/0x24c
   start_kernel+0x527/0x562
   x86_64_start_reservations+0x24/0x26
   x86_64_start_kernel+0x157/0x17a
   start_cpu+0x5/0x14

The instruction at the given address is the first read from the memmap's
memory, i.e. the read of md->type in efi_free_boot_services().

Note that the writes earlier in efi_arch_mem_reserve() don't splat because
they're done through early_memremap()ed addresses.

So, after memblock is gone, allocations should be done through the "normal"
page allocator. Introduce a helper, efi_memmap_alloc() for this. Use
it from efi_arch_mem_reserve(), efi_free_boot_services() and, for the sake
of consistency, from efi_fake_memmap() as well.

Note that for the latter, the memmap allocations cease to be page aligned.
This isn't needed though.

Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: <stable@vger.kernel.org> # v4.9
Cc: Dave Young <dyoung@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mika Penttilä <mika.penttila@nextfour.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Fixes: 4bc9f92e64c8 ("x86/efi-bgrt: Use efi_mem_reserve() to avoid copying image data")
Link: http://lkml.kernel.org/r/20170105125130.2815-1-nicstange@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/efi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index a07a476178cd..0c5420208c40 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -950,6 +950,7 @@ static inline efi_status_t efi_query_variable_store(u32 attributes,
 #endif
 extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr);
 
+extern phys_addr_t __init efi_memmap_alloc(unsigned int num_entries);
 extern int __init efi_memmap_init_early(struct efi_memory_map_data *data);
 extern int __init efi_memmap_init_late(phys_addr_t addr, unsigned long size);
 extern void __init efi_memmap_unmap(void);
-- 
cgit v1.2.3


From ea07b862ac8ef9b8c8358517d2e39f847dda6659 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 6 Jan 2017 19:21:43 -0500
Subject: mm: workingset: fix use-after-free in shadow node shrinker

Several people report seeing warnings about inconsistent radix tree
nodes followed by crashes in the workingset code, which all looked like
use-after-free access from the shadow node shrinker.

Dave Jones managed to reproduce the issue with a debug patch applied,
which confirmed that the radix tree shrinking indeed frees shadow nodes
while they are still linked to the shadow LRU:

  WARNING: CPU: 2 PID: 53 at lib/radix-tree.c:643 delete_node+0x1e4/0x200
  CPU: 2 PID: 53 Comm: kswapd0 Not tainted 4.10.0-rc2-think+ #3
  Call Trace:
     delete_node+0x1e4/0x200
     __radix_tree_delete_node+0xd/0x10
     shadow_lru_isolate+0xe6/0x220
     __list_lru_walk_one.isra.4+0x9b/0x190
     list_lru_walk_one+0x23/0x30
     scan_shadow_nodes+0x2e/0x40
     shrink_slab.part.44+0x23d/0x5d0
     shrink_node+0x22c/0x330
     kswapd+0x392/0x8f0

This is the WARN_ON_ONCE(!list_empty(&node->private_list)) placed in the
inlined radix_tree_shrink().

The problem is with 14b468791fa9 ("mm: workingset: move shadow entry
tracking to radix tree exceptional tracking"), which passes an update
callback into the radix tree to link and unlink shadow leaf nodes when
tree entries change, but forgot to pass the callback when reclaiming a
shadow node.

While the reclaimed shadow node itself is unlinked by the shrinker, its
deletion from the tree can cause the left-most leaf node in the tree to
be shrunk.  If that happens to be a shadow node as well, we don't unlink
it from the LRU as we should.

Consider this tree, where the s are shadow entries:

       root->rnode
            |
       [0       n]
        |       |
     [s    ] [sssss]

Now the shadow node shrinker reclaims the rightmost leaf node through
the shadow node LRU:

       root->rnode
            |
       [0        ]
        |
    [s     ]

Because the parent of the deleted node is the first level below the
root and has only one child in the left-most slot, the intermediate
level is shrunk and the node containing the single shadow is put in
its place:

       root->rnode
            |
       [s        ]

The shrinker again sees a single left-most slot in a first level node
and thus decides to store the shadow in root->rnode directly and free
the node - which is a leaf node on the shadow node LRU.

  root->rnode
       |
       s

Without the update callback, the freed node remains on the shadow LRU,
where it causes later shrinker runs to crash.

Pass the node updater callback into __radix_tree_delete_node() in case
the deletion causes the left-most branch in the tree to collapse too.

Also add warnings when linked nodes are freed right away, rather than
wait for the use-after-free when the list is scanned much later.

Fixes: 14b468791fa9 ("mm: workingset: move shadow entry tracking to radix tree exceptional tracking")
Reported-by: Dave Chinner <david@fromorbit.com>
Reported-by: Hugh Dickins <hughd@google.com>
Reported-by: Andrea Arcangeli <aarcange@redhat.com>
Reported-and-tested-by: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chris Leech <cleech@redhat.com>
Cc: Lee Duncan <lduncan@suse.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <mawilcox@linuxonhyperv.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 5dea8f6440e4..52bda854593b 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -306,7 +306,9 @@ void radix_tree_iter_replace(struct radix_tree_root *,
 void radix_tree_replace_slot(struct radix_tree_root *root,
 			     void **slot, void *item);
 void __radix_tree_delete_node(struct radix_tree_root *root,
-			      struct radix_tree_node *node);
+			      struct radix_tree_node *node,
+			      radix_tree_update_node_t update_node,
+			      void *private);
 void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
 void radix_tree_clear_tags(struct radix_tree_root *root,
-- 
cgit v1.2.3


From 57ea52a865144aedbcd619ee0081155e658b6f7d Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Tue, 10 Jan 2017 12:24:15 -0800
Subject: gro: Disable frag0 optimization on IPv6 ext headers

The GRO fast path caches the frag0 address.  This address becomes
invalid if frag0 is modified by pskb_may_pull or its variants.
So whenever that happens we must disable the frag0 optimization.

This is usually done through the combination of gro_header_hard
and gro_header_slow, however, the IPv6 extension header path did
the pulling directly and would continue to use the GRO fast path
incorrectly.

This patch fixes it by disabling the fast path when we enter the
IPv6 extension header path.

Fixes: 78a478d0efd9 ("gro: Inline skb_gro_header and cache frag0 virtual address")
Reported-by: Slava Shwartsman <slavash@mellanox.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 994f7423a74b..9bde9558b596 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2477,14 +2477,19 @@ static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen)
 	return NAPI_GRO_CB(skb)->frag0_len < hlen;
 }
 
+static inline void skb_gro_frag0_invalidate(struct sk_buff *skb)
+{
+	NAPI_GRO_CB(skb)->frag0 = NULL;
+	NAPI_GRO_CB(skb)->frag0_len = 0;
+}
+
 static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen,
 					unsigned int offset)
 {
 	if (!pskb_may_pull(skb, hlen))
 		return NULL;
 
-	NAPI_GRO_CB(skb)->frag0 = NULL;
-	NAPI_GRO_CB(skb)->frag0_len = 0;
+	skb_gro_frag0_invalidate(skb);
 	return skb->data + offset;
 }
 
-- 
cgit v1.2.3


From 097963959594c5eccaba42510f7033f703211bda Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 10 Jan 2017 16:57:21 -0800
Subject: mm: add follow_pte_pmd()

Patch series "Write protect DAX PMDs in *sync path".

Currently dax_mapping_entry_mkclean() fails to clean and write protect
the pmd_t of a DAX PMD entry during an *sync operation.  This can result
in data loss, as detailed in patch 2.

This series is based on Dan's "libnvdimm-pending" branch, which is the
current home for Jan's "dax: Page invalidation fixes" series.  You can
find a working tree here:

  https://git.kernel.org/cgit/linux/kernel/git/zwisler/linux.git/log/?h=dax_pmd_clean

This patch (of 2):

Similar to follow_pte(), follow_pte_pmd() allows either a PTE leaf or a
huge page PMD leaf to be found and returned.

Link: http://lkml.kernel.org/r/1482272586-21177-2-git-send-email-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fe6b4036664a..02793ac64ac6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1212,6 +1212,8 @@ void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows);
 int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
 	       spinlock_t **ptlp);
+int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+			     pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
 	unsigned long *pfn);
 int follow_phys(struct vm_area_struct *vma, unsigned long address,
-- 
cgit v1.2.3


From f729c8c9b24f0540a6e6b86e68f3888ba90ef7e7 Mon Sep 17 00:00:00 2001
From: Ross Zwisler <ross.zwisler@linux.intel.com>
Date: Tue, 10 Jan 2017 16:57:24 -0800
Subject: dax: wrprotect pmd_t in dax_mapping_entry_mkclean

Currently dax_mapping_entry_mkclean() fails to clean and write protect
the pmd_t of a DAX PMD entry during an *sync operation.  This can result
in data loss in the following sequence:

1) mmap write to DAX PMD, dirtying PMD radix tree entry and making the
   pmd_t dirty and writeable
2) fsync, flushing out PMD data and cleaning the radix tree entry. We
   currently fail to mark the pmd_t as clean and write protected.
3) more mmap writes to the PMD.  These don't cause any page faults since
   the pmd_t is dirty and writeable.  The radix tree entry remains clean.
4) fsync, which fails to flush the dirty PMD data because the radix tree
   entry was clean.
5) crash - dirty data that should have been fsync'd as part of 4) could
   still have been in the processor cache, and is lost.

Fix this by marking the pmd_t clean and write protected in
dax_mapping_entry_mkclean(), which is called as part of the fsync
operation 2).  This will cause the writes in step 3) above to generate
page faults where we'll re-dirty the PMD radix tree entry, resulting in
flushes in the fsync that happens in step 4).

Fixes: 4b4bb46d00b3 ("dax: clear dirty entry tags on cache flush")
Link: http://lkml.kernel.org/r/1482272586-21177-3-git-send-email-ross.zwisler@linux.intel.com
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 02793ac64ac6..b84615b0f64c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1210,8 +1210,6 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma);
 void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows);
-int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
-	       spinlock_t **ptlp);
 int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
 			     pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
-- 
cgit v1.2.3


From bb1107f7c6052c863692a41f78c000db792334bf Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 10 Jan 2017 16:57:27 -0800
Subject: mm, slab: make sure that KMALLOC_MAX_SIZE will fit into MAX_ORDER

Andrey Konovalov has reported the following warning triggered by the
syzkaller fuzzer.

  WARNING: CPU: 1 PID: 9935 at mm/page_alloc.c:3511 __alloc_pages_nodemask+0x159c/0x1e20
  Kernel panic - not syncing: panic_on_warn set ...
  CPU: 1 PID: 9935 Comm: syz-executor0 Not tainted 4.9.0-rc7+ #34
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
  Call Trace:
    __alloc_pages_slowpath mm/page_alloc.c:3511
    __alloc_pages_nodemask+0x159c/0x1e20 mm/page_alloc.c:3781
    alloc_pages_current+0x1c7/0x6b0 mm/mempolicy.c:2072
    alloc_pages include/linux/gfp.h:469
    kmalloc_order+0x1f/0x70 mm/slab_common.c:1015
    kmalloc_order_trace+0x1f/0x160 mm/slab_common.c:1026
    kmalloc_large include/linux/slab.h:422
    __kmalloc+0x210/0x2d0 mm/slub.c:3723
    kmalloc include/linux/slab.h:495
    ep_write_iter+0x167/0xb50 drivers/usb/gadget/legacy/inode.c:664
    new_sync_write fs/read_write.c:499
    __vfs_write+0x483/0x760 fs/read_write.c:512
    vfs_write+0x170/0x4e0 fs/read_write.c:560
    SYSC_write fs/read_write.c:607
    SyS_write+0xfb/0x230 fs/read_write.c:599
    entry_SYSCALL_64_fastpath+0x1f/0xc2

The issue is caused by a lack of size check for the request size in
ep_write_iter which should be fixed.  It, however, points to another
problem, that SLUB defines KMALLOC_MAX_SIZE too large because the its
KMALLOC_SHIFT_MAX is (MAX_ORDER + PAGE_SHIFT) which means that the
resulting page allocator request might be MAX_ORDER which is too large
(see __alloc_pages_slowpath).

The same applies to the SLOB allocator which allows even larger sizes.
Make sure that they are capped properly and never request more than
MAX_ORDER order.

Link: http://lkml.kernel.org/r/20161220130659.16461-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 084b12bad198..4c5363566815 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -226,7 +226,7 @@ static inline const char *__check_heap_object(const void *ptr,
  * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
  */
 #define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)
-#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)
+#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	3
 #endif
@@ -239,7 +239,7 @@ static inline const char *__check_heap_object(const void *ptr,
  * be allocated from the same page.
  */
 #define KMALLOC_SHIFT_HIGH	PAGE_SHIFT
-#define KMALLOC_SHIFT_MAX	30
+#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT - 1)
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	3
 #endif
-- 
cgit v1.2.3


From 41b6167e8f746b475668f1da78599fc4284f18db Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 10 Jan 2017 16:57:42 -0800
Subject: mm: get rid of __GFP_OTHER_NODE

The flag was introduced by commit 78afd5612deb ("mm: add
__GFP_OTHER_NODE flag") to allow proper accounting of remote node
allocations done by kernel daemons on behalf of a process - e.g.
khugepaged.

After "mm: fix remote numa hits statistics" we do not need and actually
use the flag so we can safely remove it because all allocations which
are satisfied from their "home" node are accounted properly.

[mhocko@suse.com: fix build]
Link: http://lkml.kernel.org/r/20170106122225.GK5556@dhcp22.suse.cz
Link: http://lkml.kernel.org/r/20170102153057.9451-3-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Taku Izumi <izumi.taku@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 4175dca4ac39..7806a8f80abc 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -38,9 +38,8 @@ struct vm_area_struct;
 #define ___GFP_ACCOUNT		0x100000u
 #define ___GFP_NOTRACK		0x200000u
 #define ___GFP_DIRECT_RECLAIM	0x400000u
-#define ___GFP_OTHER_NODE	0x800000u
-#define ___GFP_WRITE		0x1000000u
-#define ___GFP_KSWAPD_RECLAIM	0x2000000u
+#define ___GFP_WRITE		0x800000u
+#define ___GFP_KSWAPD_RECLAIM	0x1000000u
 /* If the above are modified, __GFP_BITS_SHIFT may need updating */
 
 /*
@@ -172,11 +171,6 @@ struct vm_area_struct;
  * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of
  *   distinguishing in the source between false positives and allocations that
  *   cannot be supported (e.g. page tables).
- *
- * __GFP_OTHER_NODE is for allocations that are on a remote node but that
- *   should not be accounted for as a remote allocation in vmstat. A
- *   typical user would be khugepaged collapsing a huge page on a remote
- *   node.
  */
 #define __GFP_COLD	((__force gfp_t)___GFP_COLD)
 #define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
@@ -184,10 +178,9 @@ struct vm_area_struct;
 #define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
 #define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
-#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE)
 
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT 26
+#define __GFP_BITS_SHIFT 25
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /*
-- 
cgit v1.2.3


From 2d39b3cd34e6d323720d4c61bd714f5ae202c022 Mon Sep 17 00:00:00 2001
From: Jamie Iles <jamie.iles@oracle.com>
Date: Tue, 10 Jan 2017 16:57:54 -0800
Subject: signal: protect SIGNAL_UNKILLABLE from unintentional clearing.

Since commit 00cd5c37afd5 ("ptrace: permit ptracing of /sbin/init") we
can now trace init processes.  init is initially protected with
SIGNAL_UNKILLABLE which will prevent fatal signals such as SIGSTOP, but
there are a number of paths during tracing where SIGNAL_UNKILLABLE can
be implicitly cleared.

This can result in init becoming stoppable/killable after tracing.  For
example, running:

  while true; do kill -STOP 1; done &
  strace -p 1

and then stopping strace and the kill loop will result in init being
left in state TASK_STOPPED.  Sending SIGCONT to init will resume it, but
init will now respond to future SIGSTOP signals rather than ignoring
them.

Make sure that when setting SIGNAL_STOP_CONTINUED/SIGNAL_STOP_STOPPED
that we don't clear SIGNAL_UNKILLABLE.

Link: http://lkml.kernel.org/r/20170104122017.25047-1-jamie.iles@oracle.com
Signed-off-by: Jamie Iles <jamie.iles@oracle.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4d1905245c7a..ad3ec9ec61f7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -854,6 +854,16 @@ struct signal_struct {
 
 #define SIGNAL_UNKILLABLE	0x00000040 /* for init: ignore fatal signals */
 
+#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \
+			  SIGNAL_STOP_CONTINUED)
+
+static inline void signal_set_stop_flags(struct signal_struct *sig,
+					 unsigned int flags)
+{
+	WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP));
+	sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags;
+}
+
 /* If true, all threads except ->group_exit_task have pending SIGKILL */
 static inline int signal_group_exit(const struct signal_struct *sig)
 {
-- 
cgit v1.2.3


From b4536f0c829c8586544c94735c343f9b5070bd01 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 10 Jan 2017 16:58:04 -0800
Subject: mm, memcg: fix the active list aging for lowmem requests when memcg
 is enabled

Nils Holland and Klaus Ethgen have reported unexpected OOM killer
invocations with 32b kernel starting with 4.8 kernels

	kworker/u4:5 invoked oom-killer: gfp_mask=0x2400840(GFP_NOFS|__GFP_NOFAIL), nodemask=0, order=0, oom_score_adj=0
	kworker/u4:5 cpuset=/ mems_allowed=0
	CPU: 1 PID: 2603 Comm: kworker/u4:5 Not tainted 4.9.0-gentoo #2
	[...]
	Mem-Info:
	active_anon:58685 inactive_anon:90 isolated_anon:0
	 active_file:274324 inactive_file:281962 isolated_file:0
	 unevictable:0 dirty:649 writeback:0 unstable:0
	 slab_reclaimable:40662 slab_unreclaimable:17754
	 mapped:7382 shmem:202 pagetables:351 bounce:0
	 free:206736 free_pcp:332 free_cma:0
	Node 0 active_anon:234740kB inactive_anon:360kB active_file:1097296kB inactive_file:1127848kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:29528kB dirty:2596kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 184320kB anon_thp: 808kB writeback_tmp:0kB unstable:0kB pages_scanned:0 all_unreclaimable? no
	DMA free:3952kB min:788kB low:984kB high:1180kB active_anon:0kB inactive_anon:0kB active_file:7316kB inactive_file:0kB unevictable:0kB writepending:96kB present:15992kB managed:15916kB mlocked:0kB slab_reclaimable:3200kB slab_unreclaimable:1408kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
	lowmem_reserve[]: 0 813 3474 3474
	Normal free:41332kB min:41368kB low:51708kB high:62048kB active_anon:0kB inactive_anon:0kB active_file:532748kB inactive_file:44kB unevictable:0kB writepending:24kB present:897016kB managed:836248kB mlocked:0kB slab_reclaimable:159448kB slab_unreclaimable:69608kB kernel_stack:1112kB pagetables:1404kB bounce:0kB free_pcp:528kB local_pcp:340kB free_cma:0kB
	lowmem_reserve[]: 0 0 21292 21292
	HighMem free:781660kB min:512kB low:34356kB high:68200kB active_anon:234740kB inactive_anon:360kB active_file:557232kB inactive_file:1127804kB unevictable:0kB writepending:2592kB present:2725384kB managed:2725384kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:800kB local_pcp:608kB free_cma:0kB

the oom killer is clearly pre-mature because there there is still a lot
of page cache in the zone Normal which should satisfy this lowmem
request.  Further debugging has shown that the reclaim cannot make any
forward progress because the page cache is hidden in the active list
which doesn't get rotated because inactive_list_is_low is not memcg
aware.

The code simply subtracts per-zone highmem counters from the respective
memcg's lru sizes which doesn't make any sense.  We can simply end up
always seeing the resulting active and inactive counts 0 and return
false.  This issue is not limited to 32b kernels but in practice the
effect on systems without CONFIG_HIGHMEM would be much harder to notice
because we do not invoke the OOM killer for allocations requests
targeting < ZONE_NORMAL.

Fix the issue by tracking per zone lru page counts in mem_cgroup_per_node
and subtract per-memcg highmem counts when memcg is enabled.  Introduce
helper lruvec_zone_lru_size which redirects to either zone counters or
mem_cgroup_get_zone_lru_size when appropriate.

We are losing empty LRU but non-zero lru size detection introduced by
ca707239e8a7 ("mm: update_lru_size warn and reset bad lru_size") because
of the inherent zone vs. node discrepancy.

Fixes: f8d1a31163fc ("mm: consider whether to decivate based on eligible zones inactive ratio")
Link: http://lkml.kernel.org/r/20170104100825.3729-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Nils Holland <nholland@tisys.org>
Tested-by: Nils Holland <nholland@tisys.org>
Reported-by: Klaus Ethgen <Klaus@Ethgen.de>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: <stable@vger.kernel.org>	[4.8+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 26 +++++++++++++++++++++++---
 include/linux/mm_inline.h  |  2 +-
 2 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 61d20c17f3b7..254698856b8f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -120,7 +120,7 @@ struct mem_cgroup_reclaim_iter {
  */
 struct mem_cgroup_per_node {
 	struct lruvec		lruvec;
-	unsigned long		lru_size[NR_LRU_LISTS];
+	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 
 	struct mem_cgroup_reclaim_iter	iter[DEF_PRIORITY + 1];
 
@@ -432,7 +432,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
-		int nr_pages);
+		int zid, int nr_pages);
 
 unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 					   int nid, unsigned int lru_mask);
@@ -441,9 +441,23 @@ static inline
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
 	struct mem_cgroup_per_node *mz;
+	unsigned long nr_pages = 0;
+	int zid;
 
 	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	return mz->lru_size[lru];
+	for (zid = 0; zid < MAX_NR_ZONES; zid++)
+		nr_pages += mz->lru_zone_size[zid][lru];
+	return nr_pages;
+}
+
+static inline
+unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
+		enum lru_list lru, int zone_idx)
+{
+	struct mem_cgroup_per_node *mz;
+
+	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	return mz->lru_zone_size[zone_idx][lru];
 }
 
 void mem_cgroup_handle_over_high(void);
@@ -671,6 +685,12 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
 	return 0;
 }
+static inline
+unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
+		enum lru_list lru, int zone_idx)
+{
+	return 0;
+}
 
 static inline unsigned long
 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 71613e8a720f..41d376e7116d 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -39,7 +39,7 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
 {
 	__update_lru_size(lruvec, lru, zid, nr_pages);
 #ifdef CONFIG_MEMCG
-	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
 #endif
 }
 
-- 
cgit v1.2.3


From 8c2dd3e4a4bae78093c4a5cee6494877651be3c9 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 10 Jan 2017 16:58:06 -0800
Subject: mm: rename __alloc_page_frag to page_frag_alloc and __free_page_frag
 to page_frag_free

Patch series "Page fragment updates", v4.

This patch series takes care of a few cleanups for the page fragments
API.

First we do some renames so that things are much more consistent.  First
we move the page_frag_ portion of the name to the front of the functions
names.  Secondly we split out the cache specific functions from the
other page fragment functions by adding the word "cache" to the name.

Finally I added a bit of documentation that will hopefully help to
explain some of this.  I plan to revisit this later as we get things
more ironed out in the near future with the changes planned for the DMA
setup to support eXpress Data Path.

This patch (of 3):

This patch renames the page frag functions to be more consistent with
other APIs.  Specifically we place the name page_frag first in the name
and then have either an alloc or free call name that we append as the
suffix.  This makes it a bit clearer in terms of naming.

In addition we drop the leading double underscores since we are
technically no longer a backing interface and instead the front end that
is called from the networking APIs.

Link: http://lkml.kernel.org/r/20170104023854.13451.67390.stgit@localhost.localdomain
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h    | 6 +++---
 include/linux/skbuff.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 7806a8f80abc..ed77a86fbbb0 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -501,9 +501,9 @@ extern void free_hot_cold_page_list(struct list_head *list, bool cold);
 struct page_frag_cache;
 extern void __page_frag_drain(struct page *page, unsigned int order,
 			      unsigned int count);
-extern void *__alloc_page_frag(struct page_frag_cache *nc,
-			       unsigned int fragsz, gfp_t gfp_mask);
-extern void __free_page_frag(void *addr);
+extern void *page_frag_alloc(struct page_frag_cache *nc,
+			     unsigned int fragsz, gfp_t gfp_mask);
+extern void page_frag_free(void *addr);
 
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b53c0cfd417e..a410715bbef8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2480,7 +2480,7 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
 
 static inline void skb_free_frag(void *addr)
 {
-	__free_page_frag(addr);
+	page_frag_free(addr);
 }
 
 void *napi_alloc_frag(unsigned int fragsz);
-- 
cgit v1.2.3


From 2976db8018532b624c4123ae662fbc0814877abf Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 10 Jan 2017 16:58:09 -0800
Subject: mm: rename __page_frag functions to __page_frag_cache, drop order
 from drain

This patch does two things.

First it goes through and renames the __page_frag prefixed functions to
__page_frag_cache so that we can be clear that we are draining or
refilling the cache, not the frags themselves.

Second we drop the order parameter from __page_frag_cache_drain since we
don't actually need to pass it since all fragments are either order 0 or
must be a compound page.

Link: http://lkml.kernel.org/r/20170104023954.13451.5678.stgit@localhost.localdomain
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index ed77a86fbbb0..0fe0b6295ab5 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -499,8 +499,7 @@ extern void free_hot_cold_page(struct page *page, bool cold);
 extern void free_hot_cold_page_list(struct list_head *list, bool cold);
 
 struct page_frag_cache;
-extern void __page_frag_drain(struct page *page, unsigned int order,
-			      unsigned int count);
+extern void __page_frag_cache_drain(struct page *page, unsigned int count);
 extern void *page_frag_alloc(struct page_frag_cache *nc,
 			     unsigned int fragsz, gfp_t gfp_mask);
 extern void page_frag_free(void *addr);
-- 
cgit v1.2.3


From f05714293a591038304ddae7cb0dd747bb3786cc Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Tue, 10 Jan 2017 16:58:15 -0800
Subject: mm: support anonymous stable page

During developemnt for zram-swap asynchronous writeback, I found strange
corruption of compressed page, resulting in:

  Modules linked in: zram(E)
  CPU: 3 PID: 1520 Comm: zramd-1 Tainted: G            E   4.8.0-mm1-00320-ge0d4894c9c38-dirty #3274
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
  task: ffff88007620b840 task.stack: ffff880078090000
  RIP: set_freeobj.part.43+0x1c/0x1f
  RSP: 0018:ffff880078093ca8  EFLAGS: 00010246
  RAX: 0000000000000018 RBX: ffff880076798d88 RCX: ffffffff81c408c8
  RDX: 0000000000000018 RSI: 0000000000000000 RDI: 0000000000000246
  RBP: ffff880078093cb0 R08: 0000000000000000 R09: 0000000000000000
  R10: ffff88005bc43030 R11: 0000000000001df3 R12: ffff880076798d88
  R13: 000000000005bc43 R14: ffff88007819d1b8 R15: 0000000000000001
  FS:  0000000000000000(0000) GS:ffff88007e380000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007fc934048f20 CR3: 0000000077b01000 CR4: 00000000000406e0
  Call Trace:
    obj_malloc+0x22b/0x260
    zs_malloc+0x1e4/0x580
    zram_bvec_rw+0x4cd/0x830 [zram]
    page_requests_rw+0x9c/0x130 [zram]
    zram_thread+0xe6/0x173 [zram]
    kthread+0xca/0xe0
    ret_from_fork+0x25/0x30

With investigation, it reveals currently stable page doesn't support
anonymous page.  IOW, reuse_swap_page can reuse the page without waiting
writeback completion so it can overwrite page zram is compressing.

Unfortunately, zram has used per-cpu stream feature from v4.7.
It aims for increasing cache hit ratio of scratch buffer for
compressing. Downside of that approach is that zram should ask
memory space for compressed page in per-cpu context which requires
stricted gfp flag which could be failed. If so, it retries to
allocate memory space out of per-cpu context so it could get memory
this time and compress the data again, copies it to the memory space.

In this scenario, zram assumes the data should never be changed
but it is not true unless stable page supports. So, If the data is
changed under us, zram can make buffer overrun because second
compression size could be bigger than one we got in previous trial
and blindly, copy bigger size object to smaller buffer which is
buffer overrun. The overrun breaks zsmalloc free object chaining
so system goes crash like above.

I think below is same problem.
https://bugzilla.suse.com/show_bug.cgi?id=997574

Unfortunately, reuse_swap_page should be atomic so that we cannot wait on
writeback in there so the approach in this patch is simply return false if
we found it needs stable page.  Although it increases memory footprint
temporarily, it happens rarely and it should be reclaimed easily althoug
it happened.  Also, It would be better than waiting of IO completion,
which is critial path for application latency.

Fixes: da9556a2367c ("zram: user per-cpu compression streams")
Link: http://lkml.kernel.org/r/20161120233015.GA14113@bbox
Link: http://lkml.kernel.org/r/1482366980-3782-2-git-send-email-minchan@kernel.org
Signed-off-by: Minchan Kim <minchan@kernel.org>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Darrick J. Wong <darrick.wong@oracle.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Hyeoncheol Lee <cheol.lee@lge.com>
Cc: <yjay.kim@lge.com>
Cc: Sangseok Lee <sangseok.lee@lge.com>
Cc: <stable@vger.kernel.org> [4.7+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 09f4be179ff3..7f47b7098b1b 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -150,8 +150,9 @@ enum {
 	SWP_FILE	= (1 << 7),	/* set after swap_activate success */
 	SWP_AREA_DISCARD = (1 << 8),	/* single-time swap area discards */
 	SWP_PAGE_DISCARD = (1 << 9),	/* freed swap page-cluster discards */
+	SWP_STABLE_WRITES = (1 << 10),	/* no overwrite PG_writeback pages */
 					/* add others here before... */
-	SWP_SCANNING	= (1 << 10),	/* refcount in scan_swap_map */
+	SWP_SCANNING	= (1 << 11),	/* refcount in scan_swap_map */
 };
 
 #define SWAP_CLUSTER_MAX 32UL
-- 
cgit v1.2.3


From 575b1967e10a1f3038266244d2c7a3ca6b99fed8 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Tue, 10 Jan 2017 16:58:30 -0800
Subject: timerfd: export defines to userspace

Since userspace is expected to call timerfd syscalls directly with these
flags/ioctls, make sure we export them so they don't have to duplicate
the values themselves.

Link: http://lkml.kernel.org/r/20161219064052.7196-1-vapier@gentoo.org
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/timerfd.h | 20 +-------------------
 1 file changed, 1 insertion(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timerfd.h b/include/linux/timerfd.h
index bd36ce431e32..bab0b1ad0613 100644
--- a/include/linux/timerfd.h
+++ b/include/linux/timerfd.h
@@ -8,23 +8,7 @@
 #ifndef _LINUX_TIMERFD_H
 #define _LINUX_TIMERFD_H
 
-/* For O_CLOEXEC and O_NONBLOCK */
-#include <linux/fcntl.h>
-
-/* For _IO helpers */
-#include <linux/ioctl.h>
-
-/*
- * CAREFUL: Check include/asm-generic/fcntl.h when defining
- * new flags, since they might collide with O_* ones. We want
- * to re-use O_* flags that couldn't possibly have a meaning
- * from eventfd, in order to leave a free define-space for
- * shared O_* flags.
- */
-#define TFD_TIMER_ABSTIME (1 << 0)
-#define TFD_TIMER_CANCEL_ON_SET (1 << 1)
-#define TFD_CLOEXEC O_CLOEXEC
-#define TFD_NONBLOCK O_NONBLOCK
+#include <uapi/linux/timerfd.h>
 
 #define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK)
 /* Flags for timerfd_create.  */
@@ -32,6 +16,4 @@
 /* Flags for timerfd_settime.  */
 #define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET)
 
-#define TFD_IOC_SET_TICKS	_IOW('T', 0, u64)
-
 #endif /* _LINUX_TIMERFD_H */
-- 
cgit v1.2.3


From b6416e61012429e0277bd15a229222fd17afc1c1 Mon Sep 17 00:00:00 2001
From: David Matlack <dmatlack@google.com>
Date: Fri, 16 Dec 2016 14:30:35 -0800
Subject: jump_labels: API for flushing deferred jump label updates

Modules that use static_key_deferred need a way to synchronize with
any delayed work that is still pending when the module is unloaded.
Introduce static_key_deferred_flush() which flushes any pending
jump label updates.

Signed-off-by: David Matlack <dmatlack@google.com>
Cc: stable@vger.kernel.org
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/jump_label_ratelimit.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h
index 089f70f83e97..23da3af459fe 100644
--- a/include/linux/jump_label_ratelimit.h
+++ b/include/linux/jump_label_ratelimit.h
@@ -14,6 +14,7 @@ struct static_key_deferred {
 
 #ifdef HAVE_JUMP_LABEL
 extern void static_key_slow_dec_deferred(struct static_key_deferred *key);
+extern void static_key_deferred_flush(struct static_key_deferred *key);
 extern void
 jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl);
 
@@ -26,6 +27,10 @@ static inline void static_key_slow_dec_deferred(struct static_key_deferred *key)
 	STATIC_KEY_CHECK_USE();
 	static_key_slow_dec(&key->key);
 }
+static inline void static_key_deferred_flush(struct static_key_deferred *key)
+{
+	STATIC_KEY_CHECK_USE();
+}
 static inline void
 jump_label_rate_limit(struct static_key_deferred *key,
 		unsigned long rl)
-- 
cgit v1.2.3


From f99e86485cc32cd16e5cc97f9bb0474f28608d84 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Thu, 12 Jan 2017 07:58:32 -0700
Subject: block: Rename blk_queue_zone_size and bdev_zone_size

All block device data fields and functions returning a number of 512B
sectors are by convention named xxx_sectors while names in the form
xxx_size are generally used for a number of bytes. The blk_queue_zone_size
and bdev_zone_size functions were not following this convention so rename
them.

No functional change is introduced by this patch.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>

Collapsed the two patches, they were nonsensically split and broke
bisection.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 83695641bd5e..ff3d774f2751 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -739,7 +739,7 @@ static inline bool blk_queue_is_zoned(struct request_queue *q)
 	}
 }
 
-static inline unsigned int blk_queue_zone_size(struct request_queue *q)
+static inline unsigned int blk_queue_zone_sectors(struct request_queue *q)
 {
 	return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
 }
@@ -1536,12 +1536,12 @@ static inline bool bdev_is_zoned(struct block_device *bdev)
 	return false;
 }
 
-static inline unsigned int bdev_zone_size(struct block_device *bdev)
+static inline unsigned int bdev_zone_sectors(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 
 	if (q)
-		return blk_queue_zone_size(q);
+		return blk_queue_zone_sectors(q);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 331c34255293cd02d395b7097008b509ba89e60e Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Wed, 4 Jan 2017 20:57:22 -0800
Subject: i2c: do not enable fall back to Host Notify by default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Falling back unconditionally to HostNotify as primary client's interrupt
breaks some drivers which alter their functionality depending on whether
interrupt is present or not, so let's introduce a board flag telling I2C
core explicitly if we want wired interrupt or HostNotify-based one:
I2C_CLIENT_HOST_NOTIFY.

For DT-based systems we introduce "host-notify" property that we convert
to I2C_CLIENT_HOST_NOTIFY board flag.

Tested-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Acked-by: Pali Rohár <pali.rohar@gmail.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/i2c.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index b2109c522dec..4b45ec46161f 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -665,6 +665,7 @@ i2c_unlock_adapter(struct i2c_adapter *adapter)
 #define I2C_CLIENT_TEN		0x10	/* we have a ten bit chip address */
 					/* Must equal I2C_M_TEN below */
 #define I2C_CLIENT_SLAVE	0x20	/* we are the slave */
+#define I2C_CLIENT_HOST_NOTIFY	0x40	/* We want to use I2C host notify */
 #define I2C_CLIENT_WAKE		0x80	/* for board_info; true iff can wake */
 #define I2C_CLIENT_SCCB		0x9000	/* Use Omnivision SCCB protocol */
 					/* Must match I2C_M_STOP|IGNORE_NAK */
-- 
cgit v1.2.3


From 546125d1614264d26080817d0c8cddb9b25081fa Mon Sep 17 00:00:00 2001
From: Scott Mayhew <smayhew@redhat.com>
Date: Thu, 5 Jan 2017 16:34:51 -0500
Subject: sunrpc: don't call sleeping functions from the notifier block
 callbacks

The inet6addr_chain is an atomic notifier chain, so we can't call
anything that might sleep (like lock_sock)... instead of closing the
socket from svc_age_temp_xprts_now (which is called by the notifier
function), just have the rpc service threads do it instead.

Cc: stable@vger.kernel.org
Fixes: c3d4879e01be "sunrpc: Add a function to close..."
Signed-off-by: Scott Mayhew <smayhew@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_xprt.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index e5d193440374..7440290f64ac 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -66,6 +66,7 @@ struct svc_xprt {
 #define XPT_LISTENER	10		/* listening endpoint */
 #define XPT_CACHE_AUTH	11		/* cache auth info */
 #define XPT_LOCAL	12		/* connection from loopback interface */
+#define XPT_KILL_TEMP   13		/* call xpo_kill_temp_xprt before closing */
 
 	struct svc_serv		*xpt_server;	/* service for transport */
 	atomic_t    	    	xpt_reserved;	/* space on outq that is rsvd */
-- 
cgit v1.2.3


From 003c941057eaa868ca6fedd29a274c863167230d Mon Sep 17 00:00:00 2001
From: Shannon Nelson <shannon.nelson@oracle.com>
Date: Thu, 12 Jan 2017 14:24:58 -0800
Subject: tcp: fix tcp_fastopen unaligned access complaints on sparc

Fix up a data alignment issue on sparc by swapping the order
of the cookie byte array field with the length field in
struct tcp_fastopen_cookie, and making it a proper union
to clean up the typecasting.

This addresses log complaints like these:
    log_unaligned: 113 callbacks suppressed
    Kernel unaligned access at TPC[976490] tcp_try_fastopen+0x2d0/0x360
    Kernel unaligned access at TPC[9764ac] tcp_try_fastopen+0x2ec/0x360
    Kernel unaligned access at TPC[9764c8] tcp_try_fastopen+0x308/0x360
    Kernel unaligned access at TPC[9764e4] tcp_try_fastopen+0x324/0x360
    Kernel unaligned access at TPC[976490] tcp_try_fastopen+0x2d0/0x360

Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Shannon Nelson <shannon.nelson@oracle.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index fc5848dad7a4..c93f4b3a59cb 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -62,8 +62,13 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
 
 /* TCP Fast Open Cookie as stored in memory */
 struct tcp_fastopen_cookie {
+	union {
+		u8	val[TCP_FASTOPEN_COOKIE_MAX];
+#if IS_ENABLED(CONFIG_IPV6)
+		struct in6_addr addr;
+#endif
+	};
 	s8	len;
-	u8	val[TCP_FASTOPEN_COOKIE_MAX];
 	bool	exp;	/* In RFC6994 experimental option format */
 };
 
-- 
cgit v1.2.3


From 2e3258ecfaebace1ceffaa14e0ea94775d54f46f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 13 Jan 2017 12:29:10 +0100
Subject: block: add blk_rq_payload_bytes

Add a helper to calculate the actual data transfer size for special
payload requests.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ff3d774f2751..1ca8e8fd1078 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1000,6 +1000,19 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 	return blk_rq_cur_bytes(rq) >> 9;
 }
 
+/*
+ * Some commands like WRITE SAME have a payload or data transfer size which
+ * is different from the size of the request.  Any driver that supports such
+ * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
+ * calculate the data transfer size.
+ */
+static inline unsigned int blk_rq_payload_bytes(struct request *rq)
+{
+	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
+		return rq->special_vec.bv_len;
+	return blk_rq_bytes(rq);
+}
+
 static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
 						     int op)
 {
-- 
cgit v1.2.3


From 475113d937adfd150eb82b5e2c5507125a68e7af Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 28 Dec 2016 14:31:03 +0100
Subject: perf/x86/intel: Account interrupts for PEBS errors

It's possible to set up PEBS events to get only errors and not
any data, like on SNB-X (model 45) and IVB-EP (model 62)
via 2 perf commands running simultaneously:

    taskset -c 1 ./perf record -c 4 -e branches:pp -j any -C 10

This leads to a soft lock up, because the error path of the
intel_pmu_drain_pebs_nhm() does not account event->hw.interrupt
for error PEBS interrupts, so in case you're getting ONLY
errors you don't have a way to stop the event when it's over
the max_samples_per_tick limit:

  NMI watchdog: BUG: soft lockup - CPU#22 stuck for 22s! [perf_fuzzer:5816]
  ...
  RIP: 0010:[<ffffffff81159232>]  [<ffffffff81159232>] smp_call_function_single+0xe2/0x140
  ...
  Call Trace:
   ? trace_hardirqs_on_caller+0xf5/0x1b0
   ? perf_cgroup_attach+0x70/0x70
   perf_install_in_context+0x199/0x1b0
   ? ctx_resched+0x90/0x90
   SYSC_perf_event_open+0x641/0xf90
   SyS_perf_event_open+0x9/0x10
   do_syscall_64+0x6c/0x1f0
   entry_SYSCALL64_slow_path+0x25/0x25

Add perf_event_account_interrupt() which does the interrupt
and frequency checks and call it from intel_pmu_drain_pebs_nhm()'s
error path.

We keep the pending_kill and pending_wakeup logic only in the
__perf_event_overflow() path, because they make sense only if
there's any data to deliver.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vince@deater.net>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1482931866-6018-2-git-send-email-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4741ecdb9817..78ed8105e64d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1259,6 +1259,7 @@ extern void perf_event_disable(struct perf_event *event);
 extern void perf_event_disable_local(struct perf_event *event);
 extern void perf_event_disable_inatomic(struct perf_event *event);
 extern void perf_event_task_tick(void);
+extern int perf_event_account_interrupt(struct perf_event *event);
 #else /* !CONFIG_PERF_EVENTS: */
 static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
-- 
cgit v1.2.3


From 0100a3e67a9cef64d72cd3a1da86f3ddbee50363 Mon Sep 17 00:00:00 2001
From: Peter Jones <pjones@redhat.com>
Date: Mon, 12 Dec 2016 18:42:28 -0500
Subject: efi/x86: Prune invalid memory map entries and fix boot regression

Some machines, such as the Lenovo ThinkPad W541 with firmware GNET80WW
(2.28), include memory map entries with phys_addr=0x0 and num_pages=0.

These machines fail to boot after the following commit,

  commit 8e80632fb23f ("efi/esrt: Use efi_mem_reserve() and avoid a kmalloc()")

Fix this by removing such bogus entries from the memory map.

Furthermore, currently the log output for this case (with efi=debug)
looks like:

 [    0.000000] efi: mem45: [Reserved           |   |  |  |  |  |  |  |  |  |  |  |  ] range=[0x0000000000000000-0xffffffffffffffff] (0MB)

This is clearly wrong, and also not as informative as it could be.  This
patch changes it so that if we find obviously invalid memory map
entries, we print an error and skip those entries.  It also detects the
display of the address range calculation overflow, so the new output is:

 [    0.000000] efi: [Firmware Bug]: Invalid EFI memory map entries:
 [    0.000000] efi: mem45: [Reserved           |   |  |  |  |  |  |  |   |  |  |  |  ] range=[0x0000000000000000-0x0000000000000000] (invalid)

It also detects memory map sizes that would overflow the physical
address, for example phys_addr=0xfffffffffffff000 and
num_pages=0x0200000000000001, and prints:

 [    0.000000] efi: [Firmware Bug]: Invalid EFI memory map entries:
 [    0.000000] efi: mem45: [Reserved           |   |  |  |  |  |  |  |   |  |  |  |  ] range=[phys_addr=0xfffffffffffff000-0x20ffffffffffffffff] (invalid)

It then removes these entries from the memory map.

Signed-off-by: Peter Jones <pjones@redhat.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
[ardb: refactor for clarity with no functional changes, avoid PAGE_SHIFT]
Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
[Matt: Include bugzilla info in commit log]
Cc: <stable@vger.kernel.org> # v4.9+
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=191121
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/efi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 0c5420208c40..5b1af30ece55 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -103,6 +103,7 @@ typedef	struct {
 
 #define EFI_PAGE_SHIFT		12
 #define EFI_PAGE_SIZE		(1UL << EFI_PAGE_SHIFT)
+#define EFI_PAGES_MAX		(U64_MAX >> EFI_PAGE_SHIFT)
 
 typedef struct {
 	u32 type;
-- 
cgit v1.2.3


From 4d22c75d4c7b5c5f4bd31054f09103ee490878fd Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <dave.kleikamp@oracle.com>
Date: Wed, 11 Jan 2017 13:25:00 -0600
Subject: coredump: Ensure proper size of sparse core files

If the last section of a core file ends with an unmapped or zero page,
the size of the file does not correspond with the last dump_skip() call.
gdb complains that the file is truncated and can be confusing to users.

After all of the vma sections are written, make sure that the file size
is no smaller than the current file position.

This problem can be demonstrated with gdb's bigcore testcase on the
sparc architecture.

Signed-off-by: Dave Kleikamp <dave.kleikamp@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/coredump.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index d016a121a8c4..28ffa94aed6b 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -14,6 +14,7 @@ struct coredump_params;
 extern int dump_skip(struct coredump_params *cprm, size_t nr);
 extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr);
 extern int dump_align(struct coredump_params *cprm, int align);
+extern void dump_truncate(struct coredump_params *cprm);
 #ifdef CONFIG_COREDUMP
 extern void do_coredump(const siginfo_t *siginfo);
 #else
-- 
cgit v1.2.3


From 52d7e48b86fc108e45a656d8e53e4237993c481d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 10 Jan 2017 02:28:26 -0800
Subject: rcu: Narrow early boot window of illegal synchronous grace periods

The current preemptible RCU implementation goes through three phases
during bootup.  In the first phase, there is only one CPU that is running
with preemption disabled, so that a no-op is a synchronous grace period.
In the second mid-boot phase, the scheduler is running, but RCU has
not yet gotten its kthreads spawned (and, for expedited grace periods,
workqueues are not yet running.  During this time, any attempt to do
a synchronous grace period will hang the system (or complain bitterly,
depending).  In the third and final phase, RCU is fully operational and
everything works normally.

This has been OK for some time, but there has recently been some
synchronous grace periods showing up during the second mid-boot phase.
This code worked "by accident" for awhile, but started failing as soon
as expedited RCU grace periods switched over to workqueues in commit
8b355e3bc140 ("rcu: Drive expedited grace periods from workqueue").
Note that the code was buggy even before this commit, as it was subject
to failure on real-time systems that forced all expedited grace periods
to run as normal grace periods (for example, using the rcu_normal ksysfs
parameter).  The callchain from the failure case is as follows:

early_amd_iommu_init()
|-> acpi_put_table(ivrs_base);
|-> acpi_tb_put_table(table_desc);
|-> acpi_tb_invalidate_table(table_desc);
|-> acpi_tb_release_table(...)
|-> acpi_os_unmap_memory
|-> acpi_os_unmap_iomem
|-> acpi_os_map_cleanup
|-> synchronize_rcu_expedited

The kernel showing this callchain was built with CONFIG_PREEMPT_RCU=y,
which caused the code to try using workqueues before they were
initialized, which did not go well.

This commit therefore reworks RCU to permit synchronous grace periods
to proceed during this mid-boot phase.  This commit is therefore a
fix to a regression introduced in v4.9, and is therefore being put
forward post-merge-window in v4.10.

This commit sets a flag from the existing rcu_scheduler_starting()
function which causes all synchronous grace periods to take the expedited
path.  The expedited path now checks this flag, using the requesting task
to drive the expedited grace period forward during the mid-boot phase.
Finally, this flag is updated by a core_initcall() function named
rcu_exp_runtime_mode(), which causes the runtime codepaths to be used.

Note that this arrangement assumes that tasks are not sent POSIX signals
(or anything similar) from the time that the first task is spawned
through core_initcall() time.

Fixes: 8b355e3bc140 ("rcu: Drive expedited grace periods from workqueue")
Reported-by: "Zheng, Lv" <lv.zheng@intel.com>
Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Stan Kain <stan.kain@gmail.com>
Tested-by: Ivan <waffolz@hotmail.com>
Tested-by: Emanuel Castelo <emanuel.castelo@gmail.com>
Tested-by: Bruno Pesavento <bpesavento@infinito.it>
Tested-by: Borislav Petkov <bp@suse.de>
Tested-by: Frederic Bezies <fredbezies@gmail.com>
Cc: <stable@vger.kernel.org> # 4.9.0-
---
 include/linux/rcupdate.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 321f9ed552a9..01f71e1d2e94 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -444,6 +444,10 @@ bool __rcu_is_watching(void);
 #error "Unknown RCU implementation specified to kernel configuration"
 #endif
 
+#define RCU_SCHEDULER_INACTIVE	0
+#define RCU_SCHEDULER_INIT	1
+#define RCU_SCHEDULER_RUNNING	2
+
 /*
  * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
  * initialization and destruction of rcu_head on the stack. rcu_head structures
-- 
cgit v1.2.3


From 4205e4786d0b9fc3b4fec7b1910cf645a0468307 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 10 Jan 2017 14:01:05 +0100
Subject: cpu/hotplug: Provide dynamic range for prepare stage

Mathieu reported that the LTTNG modules are broken as of 4.10-rc1 due to
the removal of the cpu hotplug notifiers.

Usually I don't care much about out of tree modules, but LTTNG is widely
used in distros. There are two ways to solve that:

1) Reserve a hotplug state for LTTNG

2) Add a dynamic range for the prepare states.

While #1 is the simplest solution, #2 is the proper one as we can convert
in tree users, which do not care about ordering, to the dynamic range as
well.

Add a dynamic range which allows LTTNG to request states in the prepare
stage.

Reported-and-tested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Sewior <bigeasy@linutronix.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1701101353010.3401@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpuhotplug.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 20bfefbe7594..d936a0021839 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -74,6 +74,8 @@ enum cpuhp_state {
 	CPUHP_ZCOMP_PREPARE,
 	CPUHP_TIMERS_DEAD,
 	CPUHP_MIPS_SOC_PREPARE,
+	CPUHP_BP_PREPARE_DYN,
+	CPUHP_BP_PREPARE_DYN_END		= CPUHP_BP_PREPARE_DYN + 20,
 	CPUHP_BRINGUP_CPU,
 	CPUHP_AP_IDLE_DEAD,
 	CPUHP_AP_OFFLINE,
-- 
cgit v1.2.3


From f1f7714ea51c56b7163fb1a5acf39c6a204dd758 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 Jan 2017 23:38:15 +0100
Subject: bpf: rework prog_digest into prog_tag

Commit 7bd509e311f4 ("bpf: add prog_digest and expose it via
fdinfo/netlink") was recently discussed, partially due to
admittedly suboptimal name of "prog_digest" in combination
with sha1 hash usage, thus inevitably and rightfully concerns
about its security in terms of collision resistance were
raised with regards to use-cases.

The intended use cases are for debugging resp. introspection
only for providing a stable "tag" over the instruction sequence
that both kernel and user space can calculate independently.
It's not usable at all for making a security relevant decision.
So collisions where two different instruction sequences generate
the same tag can happen, but ideally at a rather low rate. The
"tag" will be dumped in hex and is short enough to introspect
in tracepoints or kallsyms output along with other data such
as stack trace, etc. Thus, this patch performs a rename into
prog_tag and truncates the tag to a short output (64 bits) to
make it obvious it's not collision-free.

Should in future a hash or facility be needed with a security
relevant focus, then we can think about requirements, constraints,
etc that would fit to that situation. For now, rework the exposed
parts for the current use cases as long as nothing has been
released yet. Tested on x86_64 and s390x.

Fixes: 7bd509e311f4 ("bpf: add prog_digest and expose it via fdinfo/netlink")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h    | 2 +-
 include/linux/filter.h | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f74ae68086dc..05cf951df3fe 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -216,7 +216,7 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
-int bpf_prog_calc_digest(struct bpf_prog *fp);
+int bpf_prog_calc_tag(struct bpf_prog *fp);
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a0934e6c9bab..e4eb2546339a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -57,6 +57,8 @@ struct bpf_prog_aux;
 /* BPF program can access up to 512 bytes of stack space. */
 #define MAX_BPF_STACK	512
 
+#define BPF_TAG_SIZE	8
+
 /* Helper macros for filter block array initializers. */
 
 /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
@@ -408,7 +410,7 @@ struct bpf_prog {
 	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
-	u32			digest[SHA_DIGEST_WORDS]; /* Program digest */
+	u8			tag[BPF_TAG_SIZE];
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
 	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
 	unsigned int		(*bpf_func)(const void *ctx,
@@ -519,7 +521,7 @@ static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog)
 	return prog->len * sizeof(struct bpf_insn);
 }
 
-static inline u32 bpf_prog_digest_scratch_size(const struct bpf_prog *prog)
+static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog)
 {
 	return round_up(bpf_prog_insn_size(prog) +
 			sizeof(__be64) + 1, SHA_MESSAGE_BYTES);
-- 
cgit v1.2.3


From 5eb7c0d04f04a667c049fe090a95494a8de2955c Mon Sep 17 00:00:00 2001
From: Larry Finger <Larry.Finger@lwfinger.net>
Date: Sun, 1 Jan 2017 20:25:25 -0600
Subject: taint/module: Fix problems when out-of-kernel driver defines true or
 false

Commit 7fd8329ba502 ("taint/module: Clean up global and module taint
flags handling") used the key words true and false as character members
of a new struct. These names cause problems when out-of-kernel modules
such as VirtualBox include their own definitions of true and false.

Fixes: 7fd8329ba502 ("taint/module: Clean up global and module taint flags handling")
Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Reported-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jessica Yu <jeyu@redhat.com>
---
 include/linux/kernel.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 56aec84237ad..cb09238f6d32 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -514,8 +514,8 @@ extern enum system_states {
 #define TAINT_FLAGS_COUNT		16
 
 struct taint_flag {
-	char true;	/* character printed when tainted */
-	char false;	/* character printed when not tainted */
+	char c_true;	/* character printed when tainted */
+	char c_false;	/* character printed when not tainted */
 	bool module;	/* also show as a per-module taint flag */
 };
 
-- 
cgit v1.2.3


From 501db511397fd6efff3aa5b4e8de415b55559550 Mon Sep 17 00:00:00 2001
From: Rolf Neugebauer <rolf.neugebauer@docker.com>
Date: Tue, 17 Jan 2017 18:13:51 +0000
Subject: virtio: don't set VIRTIO_NET_HDR_F_DATA_VALID on xmit

This patch part reverts fd2a0437dc33 and e858fae2b0b8 which introduced a
subtle change in how the virtio_net flags are derived from the SKBs
ip_summed field.

With the above commits, the flags are set to VIRTIO_NET_HDR_F_DATA_VALID
when ip_summed == CHECKSUM_UNNECESSARY, thus treating it differently to
ip_summed == CHECKSUM_NONE, which should be the same.

Further, the virtio spec 1.0 / CS04 explicitly says that
VIRTIO_NET_HDR_F_DATA_VALID must not be set by the driver.

Fixes: fd2a0437dc33 ("virtio_net: introduce virtio_net_hdr_{from,to}_skb")
Fixes: e858fae2b0b8 (" virtio_net: use common code for virtio_net_hdr and skb GSO conversion")
Signed-off-by: Rolf Neugebauer <rolf.neugebauer@docker.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 66204007d7ac..56436472ccc7 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -91,8 +91,6 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 				skb_checksum_start_offset(skb));
 		hdr->csum_offset = __cpu_to_virtio16(little_endian,
 				skb->csum_offset);
-	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
-		hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
 	} /* else everything is zero */
 
 	return 0;
-- 
cgit v1.2.3


From d407bd25a204bd66b7346dde24bd3d37ef0e0b05 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 18 Jan 2017 15:14:17 +0100
Subject: bpf: don't trigger OOM killer under pressure with map alloc

This patch adds two helpers, bpf_map_area_alloc() and bpf_map_area_free(),
that are to be used for map allocations. Using kmalloc() for very large
allocations can cause excessive work within the page allocator, so i) fall
back earlier to vmalloc() when the attempt is considered costly anyway,
and even more importantly ii) don't trigger OOM killer with any of the
allocators.

Since this is based on a user space request, for example, when creating
maps with element pre-allocation, we really want such requests to fail
instead of killing other user space processes.

Also, don't spam the kernel log with warnings should any of the allocations
fail under pressure. Given that, we can make backend selection in
bpf_map_area_alloc() generic, and convert all maps over to use this API
for spots with potentially large allocation requests.

Note, replacing the one kmalloc_array() is fine as overflow checks happen
earlier in htab_map_alloc(), since it must also protect the multiplication
for vmalloc() should kmalloc_array() fail.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 05cf951df3fe..3ed1f3b1d594 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -247,6 +247,8 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_precharge_memlock(u32 pages);
+void *bpf_map_area_alloc(size_t size);
+void bpf_map_area_free(void *base);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
-- 
cgit v1.2.3


From 739e6f5945d88dcee01590913f6886132a10c215 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 11 Jan 2017 13:37:07 +0100
Subject: gpio: provide lockdep keys for nested/unnested irqchips

The helper function for adding a GPIO chip compiles in a lockdep
key for debugging, the same key is needed for nested chips as
well.

The macro construction is unreadable, replace this with two
static inlines instead.

The _gpiochip_irqchip_add prefixed function is not helpful,
rename it with gpiochip_irqchip_add_key() that tell us what the
function is actually doing.

Fixes: d245b3f9bd36 ("gpio: simplify adding threaded interrupts")
Cc: Roger Quadros <rogerq@ti.com>
Reported-by: Clemens Gruber <clemens.gruber@pqgruber.com>
Reported-by: Roger Quadros <rogerq@ti.com>
Reported-by: Grygorii Strashko <grygorii.strashko@ti.com>
Tested-by: Clemens Gruber <clemens.gruber@pqgruber.com>
Tested-by: Grygorii Strashko <grygorii.strashko@ti.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/gpio/driver.h | 70 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 50 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index c2748accea71..e973faba69dc 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -274,37 +274,67 @@ void gpiochip_set_nested_irqchip(struct gpio_chip *gpiochip,
 		struct irq_chip *irqchip,
 		int parent_irq);
 
-int _gpiochip_irqchip_add(struct gpio_chip *gpiochip,
+int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip,
+			     struct irq_chip *irqchip,
+			     unsigned int first_irq,
+			     irq_flow_handler_t handler,
+			     unsigned int type,
+			     bool nested,
+			     struct lock_class_key *lock_key);
+
+#ifdef CONFIG_LOCKDEP
+
+/*
+ * Lockdep requires that each irqchip instance be created with a
+ * unique key so as to avoid unnecessary warnings. This upfront
+ * boilerplate static inlines provides such a key for each
+ * unique instance.
+ */
+static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip,
+				       struct irq_chip *irqchip,
+				       unsigned int first_irq,
+				       irq_flow_handler_t handler,
+				       unsigned int type)
+{
+	static struct lock_class_key key;
+
+	return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq,
+					handler, type, false, &key);
+}
+
+static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip,
 			  struct irq_chip *irqchip,
 			  unsigned int first_irq,
 			  irq_flow_handler_t handler,
-			  unsigned int type,
-			  bool nested,
-			  struct lock_class_key *lock_key);
+			  unsigned int type)
+{
+
+	static struct lock_class_key key;
+
+	return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq,
+					handler, type, true, &key);
+}
+#else
+static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip,
+				       struct irq_chip *irqchip,
+				       unsigned int first_irq,
+				       irq_flow_handler_t handler,
+				       unsigned int type)
+{
+	return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq,
+					handler, type, false, NULL);
+}
 
-/* FIXME: I assume threaded IRQchips do not have the lockdep problem */
 static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip,
 			  struct irq_chip *irqchip,
 			  unsigned int first_irq,
 			  irq_flow_handler_t handler,
 			  unsigned int type)
 {
-	return _gpiochip_irqchip_add(gpiochip, irqchip, first_irq,
-				     handler, type, true, NULL);
+	return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq,
+					handler, type, true, NULL);
 }
-
-#ifdef CONFIG_LOCKDEP
-#define gpiochip_irqchip_add(...)				\
-(								\
-	({							\
-		static struct lock_class_key _key;		\
-		_gpiochip_irqchip_add(__VA_ARGS__, false, &_key); \
-	})							\
-)
-#else
-#define gpiochip_irqchip_add(...)				\
-	_gpiochip_irqchip_add(__VA_ARGS__, false, NULL)
-#endif
+#endif /* CONFIG_LOCKDEP */
 
 #endif /* CONFIG_GPIOLIB_IRQCHIP */
 
-- 
cgit v1.2.3


From e326ce013a8e851193eb337aafb1aa396c533a61 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 20 Jan 2017 03:25:34 +0100
Subject: Revert "PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag"

Revert commit 08b98d329165 (PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0
flag) as it caused system suspend (in the default configuration) to fail
on Dell XPS13 (9360) with the Kaby Lake processor.

Fixes: 08b98d329165 (PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag)
Reported-by: Paul Menzel <pmenzel@molgen.mpg.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/suspend.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 0c729c3c8549..d9718378a8be 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -194,8 +194,6 @@ struct platform_freeze_ops {
 };
 
 #ifdef CONFIG_SUSPEND
-extern suspend_state_t mem_sleep_default;
-
 /**
  * suspend_set_ops - set platform dependent suspend operations
  * @ops: The new suspend operations to set.
-- 
cgit v1.2.3


From 6391a4481ba0796805d6581e42f9f0418c099e34 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 20 Jan 2017 14:32:42 +0800
Subject: virtio-net: restore VIRTIO_HDR_F_DATA_VALID on receiving

Commit 501db511397f ("virtio: don't set VIRTIO_NET_HDR_F_DATA_VALID on
xmit") in fact disables VIRTIO_HDR_F_DATA_VALID on receiving path too,
fixing this by adding a hint (has_data_valid) and set it only on the
receiving path.

Cc: Rolf Neugebauer <rolf.neugebauer@docker.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Rolf Neugebauer <rolf.neugebauer@docker.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 56436472ccc7..5209b5ed2a64 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -56,7 +56,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 
 static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 					  struct virtio_net_hdr *hdr,
-					  bool little_endian)
+					  bool little_endian,
+					  bool has_data_valid)
 {
 	memset(hdr, 0, sizeof(*hdr));   /* no info leak */
 
@@ -91,6 +92,9 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 				skb_checksum_start_offset(skb));
 		hdr->csum_offset = __cpu_to_virtio16(little_endian,
 				skb->csum_offset);
+	} else if (has_data_valid &&
+		   skb->ip_summed == CHECKSUM_UNNECESSARY) {
+		hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
 	} /* else everything is zero */
 
 	return 0;
-- 
cgit v1.2.3


From 059aa734824165507c65fd30a55ff000afd14983 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Sun, 22 Jan 2017 14:04:29 -0500
Subject: nfs: Don't increment lock sequence ID after NFS4ERR_MOVED

Xuan Qi reports that the Linux NFSv4 client failed to lock a file
that was migrated. The steps he observed on the wire:

1. The client sent a LOCK request to the source server
2. The source server replied NFS4ERR_MOVED
3. The client switched to the destination server
4. The client sent the same LOCK request to the destination
   server with a bumped lock sequence ID
5. The destination server rejected the LOCK request with
   NFS4ERR_BAD_SEQID

RFC 3530 section 8.1.5 provides a list of NFS errors which do not
bump a lock sequence ID.

However, RFC 3530 is now obsoleted by RFC 7530. In RFC 7530 section
9.1.7, this list has been updated by the addition of NFS4ERR_MOVED.

Reported-by: Xuan Qi <xuan.qi@oracle.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: stable@vger.kernel.org # v3.7+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/nfs4.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index bca536341d1a..1b1ca04820a3 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -282,7 +282,7 @@ enum nfsstat4 {
 
 static inline bool seqid_mutating_err(u32 err)
 {
-	/* rfc 3530 section 8.1.5: */
+	/* See RFC 7530, section 9.1.7 */
 	switch (err) {
 	case NFS4ERR_STALE_CLIENTID:
 	case NFS4ERR_STALE_STATEID:
@@ -291,6 +291,7 @@ static inline bool seqid_mutating_err(u32 err)
 	case NFS4ERR_BADXDR:
 	case NFS4ERR_RESOURCE:
 	case NFS4ERR_NOFILEHANDLE:
+	case NFS4ERR_MOVED:
 		return false;
 	};
 	return true;
-- 
cgit v1.2.3


From c929ea0b910355e1876c64431f3d5802f95b3d75 Mon Sep 17 00:00:00 2001
From: Kinglong Mee <kinglongmee@gmail.com>
Date: Fri, 20 Jan 2017 16:48:39 +0800
Subject: SUNRPC: cleanup ida information when removing sunrpc module

After removing sunrpc module, I get many kmemleak information as,
unreferenced object 0xffff88003316b1e0 (size 544):
  comm "gssproxy", pid 2148, jiffies 4294794465 (age 4200.081s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<ffffffffb0cfb58a>] kmemleak_alloc+0x4a/0xa0
    [<ffffffffb03507fe>] kmem_cache_alloc+0x15e/0x1f0
    [<ffffffffb0639baa>] ida_pre_get+0xaa/0x150
    [<ffffffffb0639cfd>] ida_simple_get+0xad/0x180
    [<ffffffffc06054fb>] nlmsvc_lookup_host+0x4ab/0x7f0 [lockd]
    [<ffffffffc0605e1d>] lockd+0x4d/0x270 [lockd]
    [<ffffffffc06061e5>] param_set_timeout+0x55/0x100 [lockd]
    [<ffffffffc06cba24>] svc_defer+0x114/0x3f0 [sunrpc]
    [<ffffffffc06cbbe7>] svc_defer+0x2d7/0x3f0 [sunrpc]
    [<ffffffffc06c71da>] rpc_show_info+0x8a/0x110 [sunrpc]
    [<ffffffffb044a33f>] proc_reg_write+0x7f/0xc0
    [<ffffffffb038e41f>] __vfs_write+0xdf/0x3c0
    [<ffffffffb0390f1f>] vfs_write+0xef/0x240
    [<ffffffffb0392fbd>] SyS_write+0xad/0x130
    [<ffffffffb0d06c37>] entry_SYSCALL_64_fastpath+0x1a/0xa9
    [<ffffffffffffffff>] 0xffffffffffffffff

I found, the ida information (dynamic memory) isn't cleanup.

Signed-off-by: Kinglong Mee <kinglongmee@gmail.com>
Fixes: 2f048db4680a ("SUNRPC: Add an identifier for struct rpc_clnt")
Cc: stable@vger.kernel.org # v3.12+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/clnt.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 85cc819676e8..333ad11b3dd9 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -216,5 +216,6 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *);
 void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *);
 bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
 			const struct sockaddr *sap);
+void rpc_cleanup_clids(void);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_CLNT_H */
-- 
cgit v1.2.3


From 8a1f780e7f28c7c1d640118242cf68d528c456cd Mon Sep 17 00:00:00 2001
From: Yasuaki Ishimatsu <yasu.isimatu@gmail.com>
Date: Tue, 24 Jan 2017 15:17:45 -0800
Subject: memory_hotplug: make zone_can_shift() return a boolean value

online_{kernel|movable} is used to change the memory zone to
ZONE_{NORMAL|MOVABLE} and online the memory.

To check that memory zone can be changed, zone_can_shift() is used.
Currently the function returns minus integer value, plus integer
value and 0. When the function returns minus or plus integer value,
it means that the memory zone can be changed to ZONE_{NORNAL|MOVABLE}.

But when the function returns 0, there are two meanings.

One of the meanings is that the memory zone does not need to be changed.
For example, when memory is in ZONE_NORMAL and onlined by online_kernel
the memory zone does not need to be changed.

Another meaning is that the memory zone cannot be changed. When memory
is in ZONE_NORMAL and onlined by online_movable, the memory zone may
not be changed to ZONE_MOVALBE due to memory online limitation(see
Documentation/memory-hotplug.txt). In this case, memory must not be
onlined.

The patch changes the return type of zone_can_shift() so that memory
online operation fails when memory zone cannot be changed as follows:

Before applying patch:
   # grep -A 35 "Node 2" /proc/zoneinfo
   Node 2, zone   Normal
   <snip>
      node_scanned  0
           spanned  8388608
           present  7864320
           managed  7864320
   # echo online_movable > memory4097/state
   # grep -A 35 "Node 2" /proc/zoneinfo
   Node 2, zone   Normal
   <snip>
      node_scanned  0
           spanned  8388608
           present  8388608
           managed  8388608

   online_movable operation succeeded. But memory is onlined as
   ZONE_NORMAL, not ZONE_MOVABLE.

After applying patch:
   # grep -A 35 "Node 2" /proc/zoneinfo
   Node 2, zone   Normal
   <snip>
      node_scanned  0
           spanned  8388608
           present  7864320
           managed  7864320
   # echo online_movable > memory4097/state
   bash: echo: write error: Invalid argument
   # grep -A 35 "Node 2" /proc/zoneinfo
   Node 2, zone   Normal
   <snip>
      node_scanned  0
           spanned  8388608
           present  7864320
           managed  7864320

   online_movable operation failed because of failure of changing
   the memory zone from ZONE_NORMAL to ZONE_MOVABLE

Fixes: df429ac03936 ("memory-hotplug: more general validation of zone during online")
Link: http://lkml.kernel.org/r/2f9c3837-33d7-b6e5-59c0-6ca4372b2d84@gmail.com
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Reviewed-by: Reza Arbab <arbab@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 01033fadea47..c1784c0b4f35 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -284,7 +284,7 @@ extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
 		unsigned long map_offset);
 extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
 					  unsigned long pnum);
-extern int zone_can_shift(unsigned long pfn, unsigned long nr_pages,
-			  enum zone_type target);
+extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages,
+			  enum zone_type target, int *zone_shift);
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
-- 
cgit v1.2.3


From b94f51183b0617e7b9b4fb4137d4cf1cab7547c2 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Tue, 24 Jan 2017 15:17:53 -0800
Subject: kernel/watchdog: prevent false hardlockup on overloaded system

On an overloaded system, it is possible that a change in the watchdog
threshold can be delayed long enough to trigger a false positive.

This can easily be achieved by having a cpu spinning indefinitely on a
task, while another cpu updates watchdog threshold.

What happens is while trying to park the watchdog threads, the hrtimers
on the other cpus trigger and reprogram themselves with the new slower
watchdog threshold.  Meanwhile, the nmi watchdog is still programmed
with the old faster threshold.

Because the one cpu is blocked, it prevents the thread parking on the
other cpus from completing, which is needed to shutdown the nmi watchdog
and reprogram it correctly.  As a result, a false positive from the nmi
watchdog is reported.

Fix this by setting a park_in_progress flag to block all lockups until
the parking is complete.

Fix provided by Ulrich Obergfell.

[akpm@linux-foundation.org: s/park_in_progress/watchdog_park_in_progress/]
Link: http://lkml.kernel.org/r/1481041033-192236-1-git-send-email-dzickus@redhat.com
Signed-off-by: Don Zickus <dzickus@redhat.com>
Reviewed-by: Aaron Tomlin <atomlin@redhat.com>
Cc: Ulrich Obergfell <uobergfe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/nmi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index aacca824a6ae..0a3fadc32693 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -110,6 +110,7 @@ extern int watchdog_user_enabled;
 extern int watchdog_thresh;
 extern unsigned long watchdog_enabled;
 extern unsigned long *watchdog_cpumask_bits;
+extern atomic_t watchdog_park_in_progress;
 #ifdef CONFIG_SMP
 extern int sysctl_softlockup_all_cpu_backtrace;
 extern int sysctl_hardlockup_all_cpu_backtrace;
-- 
cgit v1.2.3


From ea57485af8f4221312a5a95d63c382b45e7840dc Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 24 Jan 2017 15:18:32 -0800
Subject: mm, page_alloc: fix check for NULL preferred_zone

Patch series "fix premature OOM regression in 4.7+ due to cpuset races".

This is v2 of my attempt to fix the recent report based on LTP cpuset
stress test [1].  The intention is to go to stable 4.9 LTSS with this,
as triggering repeated OOMs is not nice.  That's why the patches try to
be not too intrusive.

Unfortunately why investigating I found that modifying the testcase to
use per-VMA policies instead of per-task policies will bring the OOM's
back, but that seems to be much older and harder to fix problem.  I have
posted a RFC [2] but I believe that fixing the recent regressions has a
higher priority.

Longer-term we might try to think how to fix the cpuset mess in a better
and less error prone way.  I was for example very surprised to learn,
that cpuset updates change not only task->mems_allowed, but also
nodemask of mempolicies.  Until now I expected the parameter to
alloc_pages_nodemask() to be stable.  I wonder why do we then treat
cpusets specially in get_page_from_freelist() and distinguish HARDWALL
etc, when there's unconditional intersection between mempolicy and
cpuset.  I would expect the nodemask adjustment for saving overhead in
g_p_f(), but that clearly doesn't happen in the current form.  So we
have both crazy complexity and overhead, AFAICS.

[1] https://lkml.kernel.org/r/CAFpQJXUq-JuEP=QPidy4p_=FN0rkH5Z-kfB4qBvsf6jMS87Edg@mail.gmail.com
[2] https://lkml.kernel.org/r/7c459f26-13a6-a817-e508-b65b903a8378@suse.cz

This patch (of 4):

Since commit c33d6c06f60f ("mm, page_alloc: avoid looking up the first
zone in a zonelist twice") we have a wrong check for NULL preferred_zone,
which can theoretically happen due to concurrent cpuset modification.  We
check the zoneref pointer which is never NULL and we should check the zone
pointer.  Also document this in first_zones_zonelist() comment per Michal
Hocko.

Fixes: c33d6c06f60f ("mm, page_alloc: avoid looking up the first zone in a zonelist twice")
Link: http://lkml.kernel.org/r/20170120103843.24587-2-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Ganapatrao Kulkarni <gpkulkarni@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 36d9896fbc1e..f4aac87adcc3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -972,12 +972,16 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
  * @zonelist - The zonelist to search for a suitable zone
  * @highest_zoneidx - The zone index of the highest zone to return
  * @nodes - An optional nodemask to filter the zonelist with
- * @zone - The first suitable zone found is returned via this parameter
+ * @return - Zoneref pointer for the first suitable zone found (see below)
  *
  * This function returns the first zone at or below a given zone index that is
  * within the allowed nodemask. The zoneref returned is a cursor that can be
  * used to iterate the zonelist with next_zones_zonelist by advancing it by
  * one before calling.
+ *
+ * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
+ * never NULL). This may happen either genuinely, or due to concurrent nodemask
+ * update due to cpuset modification.
  */
 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 					enum zone_type highest_zoneidx,
-- 
cgit v1.2.3


From d6f8cfa3dea294eabf8f302e90176dd6381fb66e Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 25 Jan 2017 11:39:49 +0100
Subject: net: phy: leds: Break dependency of phy.h on phy_led_triggers.h

<linux/phy.h> includes <linux/phy_led_triggers.h>, which is not really
needed.  Drop the include from <linux/phy.h>, and add it to all users
that didn't include it explicitly.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index f7d95f644eed..7fc1105605bf 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -25,7 +25,6 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/mod_devicetable.h>
-#include <linux/phy_led_triggers.h>
 
 #include <linux/atomic.h>
 
-- 
cgit v1.2.3


From 3c880eb0205222bb062970085ebedc73ec8dfd14 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 25 Jan 2017 11:39:50 +0100
Subject: net: phy: leds: Fix truncated LED trigger names

Commit 4567d686f5c6d955 ("phy: increase size of MII_BUS_ID_SIZE and
bus_id") increased the size of MII bus IDs, but forgot to update the
private definition in <linux/phy_led_triggers.h>.
This may cause:
  1. Truncation of LED trigger names,
  2. Duplicate LED trigger names,
  3. Failures registering LED triggers,
  4. Crashes due to bad error handling in the LED trigger failure path.

To fix this, and prevent the definitions going out of sync again in the
future, let the PHY LED trigger code use the existing MII_BUS_ID_SIZE
definition.

Example:
  - Before I had triggers "ee700000.etherne:01:100Mbps" and
    "ee700000.etherne:01:10Mbps",
  - After the increase of MII_BUS_ID_SIZE, both became
    "ee700000.ethernet-ffffffff:01:" => FAIL,
  - Now, the triggers are "ee700000.ethernet-ffffffff:01:100Mbps" and
    "ee700000.ethernet-ffffffff:01:10Mbps", which are unique again.

Fixes: 4567d686f5c6d955 ("phy: increase size of MII_BUS_ID_SIZE and bus_id")
Fixes: 2e0bc452f4721520 ("net: phy: leds: add support for led triggers on phy link state change")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy_led_triggers.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/phy_led_triggers.h b/include/linux/phy_led_triggers.h
index a2daea0a37d2..b37b05bfd1a6 100644
--- a/include/linux/phy_led_triggers.h
+++ b/include/linux/phy_led_triggers.h
@@ -18,11 +18,11 @@ struct phy_device;
 #ifdef CONFIG_LED_TRIGGER_PHY
 
 #include <linux/leds.h>
+#include <linux/phy.h>
 
 #define PHY_LED_TRIGGER_SPEED_SUFFIX_SIZE	10
-#define PHY_MII_BUS_ID_SIZE	(20 - 3)
 
-#define PHY_LINK_LED_TRIGGER_NAME_SIZE (PHY_MII_BUS_ID_SIZE + \
+#define PHY_LINK_LED_TRIGGER_NAME_SIZE (MII_BUS_ID_SIZE + \
 				       FIELD_SIZEOF(struct mdio_device, addr)+\
 				       PHY_LED_TRIGGER_SPEED_SUFFIX_SIZE)
 
-- 
cgit v1.2.3


From 9d162ed69f51cbd9ee5a0c7e82aba7acc96362ff Mon Sep 17 00:00:00 2001
From: Sean Nyekjaer <sean.nyekjaer@prevas.dk>
Date: Fri, 27 Jan 2017 08:46:23 +0100
Subject: net: phy: micrel: add support for KSZ8795

This is adds support for the PHYs in the KSZ8795 5port managed switch.

It will allow to detect the link between the switch and the soc
and uses the same read_status functions as the KSZ8873MLL switch.

Signed-off-by: Sean Nyekjaer <sean.nyekjaer@prevas.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/micrel_phy.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 257173e0095e..f541da68d1e7 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -35,6 +35,8 @@
 #define PHY_ID_KSZ886X		0x00221430
 #define PHY_ID_KSZ8863		0x00221435
 
+#define PHY_ID_KSZ8795		0x00221550
+
 /* struct phy_device dev_flags definitions */
 #define MICREL_PHY_50MHZ_CLK	0x00000001
 #define MICREL_PHY_FXEN		0x00000002
-- 
cgit v1.2.3