From ae7871be189cb41184f1e05742b4a99e2c59774d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 16 Dec 2016 14:28:41 +0100 Subject: swiotlb: Convert swiotlb_force from int to enum Convert the flag swiotlb_force from an int to an enum, to prepare for the advent of more possible values. Suggested-by: Konrad Rzeszutek Wilk Signed-off-by: Geert Uytterhoeven Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 183f37c8a5e1..71d104e4c849 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -9,7 +9,12 @@ struct device; struct page; struct scatterlist; -extern int swiotlb_force; +enum swiotlb_force { + SWIOTLB_NORMAL, /* Default - depending on HW DMA mask etc. */ + SWIOTLB_FORCE, /* swiotlb=force */ +}; + +extern enum swiotlb_force swiotlb_force; /* * Maximum allowable number of contiguous slabs to map, -- cgit v1.2.3 From fff5d99225107f5f13fe4a9805adc2a1c4b5fb00 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 16 Dec 2016 14:28:42 +0100 Subject: swiotlb: Add swiotlb=noforce debug option On architectures like arm64, swiotlb is tied intimately to the core architecture DMA support. In addition, ZONE_DMA cannot be disabled. To aid debugging and catch devices not supporting DMA to memory outside the 32-bit address space, add a kernel command line option "swiotlb=noforce", which disables the use of bounce buffers. If specified, trying to map memory that cannot be used with DMA will fail, and a rate-limited warning will be printed. Note that io_tlb_nslabs is set to 1, which is the minimal supported value. Signed-off-by: Geert Uytterhoeven Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 71d104e4c849..d9c84a9cde3d 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -12,6 +12,7 @@ struct scatterlist; enum swiotlb_force { SWIOTLB_NORMAL, /* Default - depending on HW DMA mask etc. */ SWIOTLB_FORCE, /* swiotlb=force */ + SWIOTLB_NO_FORCE, /* swiotlb=noforce */ }; extern enum swiotlb_force swiotlb_force; -- cgit v1.2.3 From 72c5296f9d64d8f5f27c2133e5f108a45a353d71 Mon Sep 17 00:00:00 2001 From: Jon Derrick Date: Tue, 20 Dec 2016 14:38:14 -0700 Subject: genhd: remove dead and duplicated scsi code blk_scsi_cmd_filter use was deprecated by 4beab5c6 and the SCSI macros are duplicated in blkdev.h, both likely reintroduced by a bad merge from 540eed56. Signed-off-by: Jon Derrick Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/genhd.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e0341af6950e..76f39754e7b0 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -146,15 +146,6 @@ enum { DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ }; -#define BLK_SCSI_MAX_CMDS (256) -#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) - -struct blk_scsi_cmd_filter { - unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; - unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; - struct kobject kobj; -}; - struct disk_part_tbl { struct rcu_head rcu_head; int len; -- cgit v1.2.3 From e3ba730702af370563f66cb610b71aa0ca67955e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Dec 2016 10:15:20 +0100 Subject: fsnotify: Remove fsnotify_duplicate_mark() There are only two calls sites of fsnotify_duplicate_mark(). Those are in kernel/audit_tree.c and both are bogus. Vfsmount pointer is unused for audit tree, inode pointer and group gets set in fsnotify_add_mark_locked() later anyway, mask and free_mark are already set in alloc_chunk(). In fact, calling fsnotify_duplicate_mark() is actively harmful because following fsnotify_add_mark_locked() will leak group reference by overwriting the group pointer. So just remove the two calls to fsnotify_duplicate_mark() and the function. Signed-off-by: Jan Kara [PM: line wrapping to fit in 80 chars] Signed-off-by: Paul Moore --- include/linux/fsnotify_backend.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7268ed076be8..ce77caa2bb10 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -324,8 +324,6 @@ extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(str extern struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct inode *inode); /* find (and take a reference) to a mark associated with group and vfsmount */ extern struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt); -/* copy the values from old into new */ -extern void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old); /* set the ignored_mask of a mark */ extern void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask); /* set the mask of a mark (might pin the object into memory */ -- cgit v1.2.3 From 1efbd205b3cc5882a8c386c58a57134044e9d5ba Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 28 Dec 2016 14:58:39 +0200 Subject: Revert "net/mlx5: Add MPCNT register infrastructure" This reverts commit 7f503169cabd70c1f13b9279c50eca7dfb9a7d51. Fixes: 7f503169cabd ("net/mlx5: Add MPCNT register infrastructure") Signed-off-by: Gal Pressman Reported-by: Jesper Dangaard Brouer Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 5 --- include/linux/mlx5/driver.h | 1 - include/linux/mlx5/mlx5_ifc.h | 93 ------------------------------------------- 3 files changed, 99 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 9f489365b3d3..52b437431c6a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1071,11 +1071,6 @@ enum { MLX5_INFINIBAND_PORT_COUNTERS_GROUP = 0x20, }; -enum { - MLX5_PCIE_PERFORMANCE_COUNTERS_GROUP = 0x0, - MLX5_PCIE_TIMERS_AND_STATES_COUNTERS_GROUP = 0x2, -}; - static inline u16 mlx5_to_sw_pkey_sz(int pkey_sz) { if (pkey_sz > MLX5_MAX_LOG_PKEY_TABLE) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 0ae55361e674..735b36335f29 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -123,7 +123,6 @@ enum { MLX5_REG_HOST_ENDIANNESS = 0x7004, MLX5_REG_MCIA = 0x9014, MLX5_REG_MLCR = 0x902b, - MLX5_REG_MPCNT = 0x9051, }; enum mlx5_dcbx_oper_mode { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 57bec544e20a..a852e9db6f0d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1757,80 +1757,6 @@ struct mlx5_ifc_eth_802_3_cntrs_grp_data_layout_bits { u8 reserved_at_4c0[0x300]; }; -struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits { - u8 life_time_counter_high[0x20]; - - u8 life_time_counter_low[0x20]; - - u8 rx_errors[0x20]; - - u8 tx_errors[0x20]; - - u8 l0_to_recovery_eieos[0x20]; - - u8 l0_to_recovery_ts[0x20]; - - u8 l0_to_recovery_framing[0x20]; - - u8 l0_to_recovery_retrain[0x20]; - - u8 crc_error_dllp[0x20]; - - u8 crc_error_tlp[0x20]; - - u8 reserved_at_140[0x680]; -}; - -struct mlx5_ifc_pcie_tas_cntrs_grp_data_layout_bits { - u8 life_time_counter_high[0x20]; - - u8 life_time_counter_low[0x20]; - - u8 time_to_boot_image_start[0x20]; - - u8 time_to_link_image[0x20]; - - u8 calibration_time[0x20]; - - u8 time_to_first_perst[0x20]; - - u8 time_to_detect_state[0x20]; - - u8 time_to_l0[0x20]; - - u8 time_to_crs_en[0x20]; - - u8 time_to_plastic_image_start[0x20]; - - u8 time_to_iron_image_start[0x20]; - - u8 perst_handler[0x20]; - - u8 times_in_l1[0x20]; - - u8 times_in_l23[0x20]; - - u8 dl_down[0x20]; - - u8 config_cycle1usec[0x20]; - - u8 config_cycle2to7usec[0x20]; - - u8 config_cycle_8to15usec[0x20]; - - u8 config_cycle_16_to_63usec[0x20]; - - u8 config_cycle_64usec[0x20]; - - u8 correctable_err_msg_sent[0x20]; - - u8 non_fatal_err_msg_sent[0x20]; - - u8 fatal_err_msg_sent[0x20]; - - u8 reserved_at_2e0[0x4e0]; -}; - struct mlx5_ifc_cmd_inter_comp_event_bits { u8 command_completion_vector[0x20]; @@ -2995,12 +2921,6 @@ union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits { u8 reserved_at_0[0x7c0]; }; -union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits { - struct mlx5_ifc_pcie_perf_cntrs_grp_data_layout_bits pcie_perf_cntrs_grp_data_layout; - struct mlx5_ifc_pcie_tas_cntrs_grp_data_layout_bits pcie_tas_cntrs_grp_data_layout; - u8 reserved_at_0[0x7c0]; -}; - union mlx5_ifc_event_auto_bits { struct mlx5_ifc_comp_event_bits comp_event; struct mlx5_ifc_dct_events_bits dct_events; @@ -7320,18 +7240,6 @@ struct mlx5_ifc_ppcnt_reg_bits { union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set; }; -struct mlx5_ifc_mpcnt_reg_bits { - u8 reserved_at_0[0x8]; - u8 pcie_index[0x8]; - u8 reserved_at_10[0xa]; - u8 grp[0x6]; - - u8 clr[0x1]; - u8 reserved_at_21[0x1f]; - - union mlx5_ifc_pcie_cntrs_grp_data_layout_auto_bits counter_set; -}; - struct mlx5_ifc_ppad_reg_bits { u8 reserved_at_0[0x3]; u8 single_mac[0x1]; @@ -7937,7 +7845,6 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_pmtu_reg_bits pmtu_reg; struct mlx5_ifc_ppad_reg_bits ppad_reg; struct mlx5_ifc_ppcnt_reg_bits ppcnt_reg; - struct mlx5_ifc_mpcnt_reg_bits mpcnt_reg; struct mlx5_ifc_pplm_reg_bits pplm_reg; struct mlx5_ifc_pplr_reg_bits pplr_reg; struct mlx5_ifc_ppsc_reg_bits ppsc_reg; -- cgit v1.2.3 From 10b1c04e92229ebeb38ccd0dcf2b6d3ec73c0575 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 29 Dec 2016 18:37:13 +0200 Subject: net/mlx4_core: Fix raw qp flow steering rules under SRIOV Demoting simple flow steering rule priority (for DPDK) was achieved by wrapping FW commands MLX4_QP_FLOW_STEERING_ATTACH/DETACH for the PF as well, and forcing the priority to MLX4_DOMAIN_NIC in the wrapper function for the PF and all VFs. In function mlx4_ib_create_flow(), this change caused the main rule creation for the PF to be wrapped, while it left the associated tunnel steering rule creation unwrapped for the PF. This mismatch caused rule deletion failures in mlx4_ib_destroy_flow() for the PF when the detach wrapper function did not find the associated tunnel-steering rule (since creation of that rule for the PF did not go through the wrapper function). Fix this by setting MLX4_QP_FLOW_STEERING_ATTACH/DETACH to be "native" (so that the PF invocation does not go through the wrapper), and perform the required priority demotion for the PF in the mlx4_ib_create_flow() code path. Fixes: 48564135cba8 ("net/mlx4_core: Demote simple multicast and broadcast flow steering rules") Signed-off-by: Jack Morgenstein Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- include/linux/mlx4/device.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 93bdb3485192..6533c16e27ad 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1384,6 +1384,8 @@ int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val); int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv); int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port, bool *vlan_offload_disabled); +void mlx4_handle_eth_header_mcast_prio(struct mlx4_net_trans_rule_hw_ctrl *ctrl, + struct _rule_hw *eth_header); int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx); int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); -- cgit v1.2.3 From a0c10687ec9506b5e14fe3dd47832a77f2f2500c Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Fri, 30 Dec 2016 03:21:38 -0800 Subject: Revert "remoteproc: Merge table_ptr and cached_table pointers" Following any fw_rsc_vdev entries in the resource table are two variable length arrays, the first one reference vring resources and the second one is the virtio config space. The virtio config space is used by virtio to communicate status and configuration changes and must as such be shared with the remote. The reverted commit incorrectly made any changes to the virtio config space only affect the local copy, in an attempt to allowing memory protection of the shared resource table. This reverts commit cda8529346935fc86f476999ac4fbfe4e17abf11. Signed-off-by: Bjorn Andersson --- include/linux/remoteproc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index e2f3a3281d8f..8265d351c9f0 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -408,7 +408,8 @@ enum rproc_crash_type { * @crash_comp: completion used to sync crash handler and the rproc reload * @recovery_disabled: flag that state if recovery was disabled * @max_notifyid: largest allocated notify id. - * @table_ptr: our copy of the resource table + * @table_ptr: pointer to the resource table in effect + * @cached_table: copy of the resource table * @has_iommu: flag to indicate if remote processor is behind an MMU */ struct rproc { @@ -440,6 +441,7 @@ struct rproc { bool recovery_disabled; int max_notifyid; struct resource_table *table_ptr; + struct resource_table *cached_table; bool has_iommu; bool auto_boot; }; -- cgit v1.2.3 From 42930553a7c11f06351bc08b889808d0f6020f08 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 30 Dec 2016 08:13:38 -0700 Subject: vfio-mdev: de-polute the namespace, rename parent_device & parent_ops Add an mdev_ prefix so we're not poluting the namespace so much. Cc: Zhenyu Wang Cc: Zhi Wang Cc: Jike Song Signed-off-by: Alex Williamson Reviewed by: Kirti Wankhede --- include/linux/mdev.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index ec819e9a115a..853bb78e5866 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -14,9 +14,9 @@ #define MDEV_H /* Parent device */ -struct parent_device { - struct device *dev; - const struct parent_ops *ops; +struct mdev_parent { + struct device *dev; + const struct mdev_parent_ops *ops; /* internal */ struct kref ref; @@ -29,7 +29,7 @@ struct parent_device { /* Mediated device */ struct mdev_device { struct device dev; - struct parent_device *parent; + struct mdev_parent *parent; uuid_le uuid; void *driver_data; @@ -40,7 +40,7 @@ struct mdev_device { }; /** - * struct parent_ops - Structure to be registered for each parent device to + * struct mdev_parent_ops - Structure to be registered for each parent device to * register the device to mdev module. * * @owner: The module owner. @@ -86,10 +86,10 @@ struct mdev_device { * @mdev: mediated device structure * @vma: vma structure * Parent device that support mediated device should be registered with mdev - * module with parent_ops structure. + * module with mdev_parent_ops structure. **/ -struct parent_ops { +struct mdev_parent_ops { struct module *owner; const struct attribute_group **dev_attr_groups; const struct attribute_group **mdev_attr_groups; @@ -159,7 +159,7 @@ extern struct bus_type mdev_bus_type; #define dev_is_mdev(d) ((d)->bus == &mdev_bus_type) extern int mdev_register_device(struct device *dev, - const struct parent_ops *ops); + const struct mdev_parent_ops *ops); extern void mdev_unregister_device(struct device *dev); extern int mdev_register_driver(struct mdev_driver *drv, struct module *owner); -- cgit v1.2.3 From 9372e6feaafb65d88f667ffb5b7b425f8568344f Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 30 Dec 2016 08:13:41 -0700 Subject: vfio-mdev: Make mdev_parent private Rather than hoping for good behavior by marking some elements internal, enforce it by making the entire structure private and creating an accessor function for the one useful external field. Cc: Zhenyu Wang Cc: Zhi Wang Cc: Jike Song Signed-off-by: Alex Williamson Reviewed by: Kirti Wankhede --- include/linux/mdev.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 853bb78e5866..f586222b6c25 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -13,19 +13,6 @@ #ifndef MDEV_H #define MDEV_H -/* Parent device */ -struct mdev_parent { - struct device *dev; - const struct mdev_parent_ops *ops; - - /* internal */ - struct kref ref; - struct mutex lock; - struct list_head next; - struct kset *mdev_types_kset; - struct list_head type_list; -}; - /* Mediated device */ struct mdev_device { struct device dev; @@ -165,4 +152,6 @@ extern void mdev_unregister_device(struct device *dev); extern int mdev_register_driver(struct mdev_driver *drv, struct module *owner); extern void mdev_unregister_driver(struct mdev_driver *drv); +extern struct device *mdev_parent_dev(struct mdev_device *mdev); + #endif /* MDEV_H */ -- cgit v1.2.3 From 99e3123e3d72616a829dad6d25aa005ef1ef9b13 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 30 Dec 2016 08:13:44 -0700 Subject: vfio-mdev: Make mdev_device private and abstract interfaces Abstract access to mdev_device so that we can define which interfaces are public rather than relying on comments in the structure. Cc: Zhenyu Wang Cc: Zhi Wang Signed-off-by: Alex Williamson Reviewed-by: Jike Song Reviewed by: Kirti Wankhede --- include/linux/mdev.h | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index f586222b6c25..3ee44b8d2bb3 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -13,18 +13,7 @@ #ifndef MDEV_H #define MDEV_H -/* Mediated device */ -struct mdev_device { - struct device dev; - struct mdev_parent *parent; - uuid_le uuid; - void *driver_data; - - /* internal */ - struct kref ref; - struct list_head next; - struct kobject *type_kobj; -}; +struct mdev_device; /** * struct mdev_parent_ops - Structure to be registered for each parent device to @@ -75,7 +64,6 @@ struct mdev_device { * Parent device that support mediated device should be registered with mdev * module with mdev_parent_ops structure. **/ - struct mdev_parent_ops { struct module *owner; const struct attribute_group **dev_attr_groups; @@ -129,22 +117,13 @@ struct mdev_driver { }; #define to_mdev_driver(drv) container_of(drv, struct mdev_driver, driver) -#define to_mdev_device(dev) container_of(dev, struct mdev_device, dev) -static inline void *mdev_get_drvdata(struct mdev_device *mdev) -{ - return mdev->driver_data; -} - -static inline void mdev_set_drvdata(struct mdev_device *mdev, void *data) -{ - mdev->driver_data = data; -} +extern void *mdev_get_drvdata(struct mdev_device *mdev); +extern void mdev_set_drvdata(struct mdev_device *mdev, void *data); +extern uuid_le mdev_uuid(struct mdev_device *mdev); extern struct bus_type mdev_bus_type; -#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type) - extern int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops); extern void mdev_unregister_device(struct device *dev); @@ -153,5 +132,7 @@ extern int mdev_register_driver(struct mdev_driver *drv, struct module *owner); extern void mdev_unregister_driver(struct mdev_driver *drv); extern struct device *mdev_parent_dev(struct mdev_device *mdev); +extern struct device *mdev_dev(struct mdev_device *mdev); +extern struct mdev_device *mdev_from_dev(struct device *dev); #endif /* MDEV_H */ -- cgit v1.2.3 From 65e4345c8ef8811bbb4860fe5f2df10646b7f2e1 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 30 Dec 2016 23:54:18 +0100 Subject: iio: accel: st_accel: fix LIS3LV02 reading and scaling The LIS3LV02 has a special bit that need to be set to get the read values left aligned. Before this patch we get gibberish like this: iio_generic_buffer -a -c10 -n lis3lv02dl_accel (...) 0.000000 -0.010042 -0.642688 19155832931907 0.000000 -0.010042 -0.642688 19155858751073 Which is because we read a raw value for 1g as 64 which is the nominal 1024 for 1g shifted 4 bits to the left by being right-aligned rather than left aligned. Since all other sensors are left aligned, add some code to set the special DAS (data alignment setting) bit to 1 so that the right value is now read like this: iio_generic_buffer -a -c10 -n lis3lv02dl_accel (...) 0.000000 -0.147095 -10.120135 24761614364956 -0.029419 -0.176514 -10.120135 24761631624540 The scaling was weird as well: we have a gain of 1000 for 1g and 3000 for 6g. I don't even remember how I came up with the old values but they are wrong. Fixes: 3acddf74f807 ("iio: st-sensors: add support for lis3lv02d accelerometer") Cc: Lorenzo Bianconi Cc: Giuseppe Barba Cc: Denis Ciocca Signed-off-by: Linus Walleij Signed-off-by: Jonathan Cameron --- include/linux/iio/common/st_sensors.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 228bd44efa4c..497f2b3a5a62 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -115,6 +115,16 @@ struct st_sensor_bdu { u8 mask; }; +/** + * struct st_sensor_das - ST sensor device data alignment selection + * @addr: address of the register. + * @mask: mask to write the das flag for left alignment. + */ +struct st_sensor_das { + u8 addr; + u8 mask; +}; + /** * struct st_sensor_data_ready_irq - ST sensor device data-ready interrupt * @addr: address of the register. @@ -185,6 +195,7 @@ struct st_sensor_transfer_function { * @enable_axis: Enable one or more axis of the sensor. * @fs: Full scale register and full scale list available. * @bdu: Block data update register. + * @das: Data Alignment Selection register. * @drdy_irq: Data ready register of the sensor. * @multi_read_bit: Use or not particular bit for [I2C/SPI] multi-read. * @bootime: samples to discard when sensor passing from power-down to power-up. @@ -200,6 +211,7 @@ struct st_sensor_settings { struct st_sensor_axis enable_axis; struct st_sensor_fullscale fs; struct st_sensor_bdu bdu; + struct st_sensor_das das; struct st_sensor_data_ready_irq drdy_irq; bool multi_read_bit; unsigned int bootime; -- cgit v1.2.3 From c6ef7fd40eddad38a8825cbd6bb2ce8bdbba88f5 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 4 Jan 2017 15:08:15 -0500 Subject: vfio-mdev: fix non-standard ioctl return val causing i386 build fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit What appears to be a copy and paste error from the line above gets the ioctl a ssize_t return value instead of the traditional "int". The associated sample code used "long" which meant it would compile for x86-64 but not i386, with the latter failing as follows: CC [M] samples/vfio-mdev/mtty.o samples/vfio-mdev/mtty.c:1418:20: error: initialization from incompatible pointer type [-Werror=incompatible-pointer-types] .ioctl = mtty_ioctl, ^ samples/vfio-mdev/mtty.c:1418:20: note: (near initialization for ‘mdev_fops.ioctl’) cc1: some warnings being treated as errors Since in this case, vfio is working with struct file_operations; as such: long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); ...and so here we just standardize on long vs. the normal int that user space typically sees and documents as per "man ioctl" and similar. Fixes: 9d1a546c53b4 ("docs: Sample driver to demonstrate how to use Mediated device framework.") Cc: Kirti Wankhede Cc: Neo Jia Cc: kvm@vger.kernel.org Signed-off-by: Paul Gortmaker Signed-off-by: Alex Williamson --- include/linux/mdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 3ee44b8d2bb3..b6e048e1045f 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -78,7 +78,7 @@ struct mdev_parent_ops { size_t count, loff_t *ppos); ssize_t (*write)(struct mdev_device *mdev, const char __user *buf, size_t count, loff_t *ppos); - ssize_t (*ioctl)(struct mdev_device *mdev, unsigned int cmd, + long (*ioctl)(struct mdev_device *mdev, unsigned int cmd, unsigned long arg); int (*mmap)(struct mdev_device *mdev, struct vm_area_struct *vma); }; -- cgit v1.2.3 From 7453c549f5f6485c0d79cad7844870dcc7d1b34d Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 20 Dec 2016 10:02:02 -0500 Subject: swiotlb: Export swiotlb_max_segment to users So they can figure out what is the optimal number of pages that can be contingously stitched together without fear of bounce buffer. We also expose an mechanism for sub-users of SWIOTLB API, such as Xen-SWIOTLB to set the max segment value. And lastly if swiotlb=force is set (which mandates we bounce buffer everything) we set max_segment so at least we can bounce buffer one 4K page instead of a giant 512KB one for which we may not have space. Signed-off-by: Konrad Rzeszutek Wilk Reported-and-Tested-by: Juergen Gross --- include/linux/swiotlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index d9c84a9cde3d..4ee479f2f355 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -114,11 +114,14 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask); #ifdef CONFIG_SWIOTLB extern void __init swiotlb_free(void); +unsigned int swiotlb_max_segment(void); #else static inline void swiotlb_free(void) { } +static inline unsigned int swiotlb_max_segment(void) { return 0; } #endif extern void swiotlb_print_info(void); extern int is_swiotlb_buffer(phys_addr_t paddr); +extern void swiotlb_set_max_segment(unsigned int); #endif /* __LINUX_SWIOTLB_H */ -- cgit v1.2.3 From 20b1e22d01a4b0b11d3a1066e9feb04be38607ec Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 5 Jan 2017 13:51:29 +0100 Subject: x86/efi: Don't allocate memmap through memblock after mm_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the following commit: 4bc9f92e64c8 ("x86/efi-bgrt: Use efi_mem_reserve() to avoid copying image data") ... efi_bgrt_init() calls into the memblock allocator through efi_mem_reserve() => efi_arch_mem_reserve() *after* mm_init() has been called. Indeed, KASAN reports a bad read access later on in efi_free_boot_services(): BUG: KASAN: use-after-free in efi_free_boot_services+0xae/0x24c at addr ffff88022de12740 Read of size 4 by task swapper/0/0 page:ffffea0008b78480 count:0 mapcount:-127 mapping: (null) index:0x1 flags: 0x5fff8000000000() [...] Call Trace: dump_stack+0x68/0x9f kasan_report_error+0x4c8/0x500 kasan_report+0x58/0x60 __asan_load4+0x61/0x80 efi_free_boot_services+0xae/0x24c start_kernel+0x527/0x562 x86_64_start_reservations+0x24/0x26 x86_64_start_kernel+0x157/0x17a start_cpu+0x5/0x14 The instruction at the given address is the first read from the memmap's memory, i.e. the read of md->type in efi_free_boot_services(). Note that the writes earlier in efi_arch_mem_reserve() don't splat because they're done through early_memremap()ed addresses. So, after memblock is gone, allocations should be done through the "normal" page allocator. Introduce a helper, efi_memmap_alloc() for this. Use it from efi_arch_mem_reserve(), efi_free_boot_services() and, for the sake of consistency, from efi_fake_memmap() as well. Note that for the latter, the memmap allocations cease to be page aligned. This isn't needed though. Tested-by: Dan Williams Signed-off-by: Nicolai Stange Reviewed-by: Ard Biesheuvel Cc: # v4.9 Cc: Dave Young Cc: Linus Torvalds Cc: Matt Fleming Cc: Mika Penttilä Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Fixes: 4bc9f92e64c8 ("x86/efi-bgrt: Use efi_mem_reserve() to avoid copying image data") Link: http://lkml.kernel.org/r/20170105125130.2815-1-nicstange@gmail.com Signed-off-by: Ingo Molnar --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index a07a476178cd..0c5420208c40 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -950,6 +950,7 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, #endif extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr); +extern phys_addr_t __init efi_memmap_alloc(unsigned int num_entries); extern int __init efi_memmap_init_early(struct efi_memory_map_data *data); extern int __init efi_memmap_init_late(phys_addr_t addr, unsigned long size); extern void __init efi_memmap_unmap(void); -- cgit v1.2.3 From ea07b862ac8ef9b8c8358517d2e39f847dda6659 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 6 Jan 2017 19:21:43 -0500 Subject: mm: workingset: fix use-after-free in shadow node shrinker Several people report seeing warnings about inconsistent radix tree nodes followed by crashes in the workingset code, which all looked like use-after-free access from the shadow node shrinker. Dave Jones managed to reproduce the issue with a debug patch applied, which confirmed that the radix tree shrinking indeed frees shadow nodes while they are still linked to the shadow LRU: WARNING: CPU: 2 PID: 53 at lib/radix-tree.c:643 delete_node+0x1e4/0x200 CPU: 2 PID: 53 Comm: kswapd0 Not tainted 4.10.0-rc2-think+ #3 Call Trace: delete_node+0x1e4/0x200 __radix_tree_delete_node+0xd/0x10 shadow_lru_isolate+0xe6/0x220 __list_lru_walk_one.isra.4+0x9b/0x190 list_lru_walk_one+0x23/0x30 scan_shadow_nodes+0x2e/0x40 shrink_slab.part.44+0x23d/0x5d0 shrink_node+0x22c/0x330 kswapd+0x392/0x8f0 This is the WARN_ON_ONCE(!list_empty(&node->private_list)) placed in the inlined radix_tree_shrink(). The problem is with 14b468791fa9 ("mm: workingset: move shadow entry tracking to radix tree exceptional tracking"), which passes an update callback into the radix tree to link and unlink shadow leaf nodes when tree entries change, but forgot to pass the callback when reclaiming a shadow node. While the reclaimed shadow node itself is unlinked by the shrinker, its deletion from the tree can cause the left-most leaf node in the tree to be shrunk. If that happens to be a shadow node as well, we don't unlink it from the LRU as we should. Consider this tree, where the s are shadow entries: root->rnode | [0 n] | | [s ] [sssss] Now the shadow node shrinker reclaims the rightmost leaf node through the shadow node LRU: root->rnode | [0 ] | [s ] Because the parent of the deleted node is the first level below the root and has only one child in the left-most slot, the intermediate level is shrunk and the node containing the single shadow is put in its place: root->rnode | [s ] The shrinker again sees a single left-most slot in a first level node and thus decides to store the shadow in root->rnode directly and free the node - which is a leaf node on the shadow node LRU. root->rnode | s Without the update callback, the freed node remains on the shadow LRU, where it causes later shrinker runs to crash. Pass the node updater callback into __radix_tree_delete_node() in case the deletion causes the left-most branch in the tree to collapse too. Also add warnings when linked nodes are freed right away, rather than wait for the use-after-free when the list is scanned much later. Fixes: 14b468791fa9 ("mm: workingset: move shadow entry tracking to radix tree exceptional tracking") Reported-by: Dave Chinner Reported-by: Hugh Dickins Reported-by: Andrea Arcangeli Reported-and-tested-by: Dave Jones Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Cc: Chris Leech Cc: Lee Duncan Cc: Jan Kara Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 5dea8f6440e4..52bda854593b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -306,7 +306,9 @@ void radix_tree_iter_replace(struct radix_tree_root *, void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item); void __radix_tree_delete_node(struct radix_tree_root *root, - struct radix_tree_node *node); + struct radix_tree_node *node, + radix_tree_update_node_t update_node, + void *private); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); void radix_tree_clear_tags(struct radix_tree_root *root, -- cgit v1.2.3 From 57ea52a865144aedbcd619ee0081155e658b6f7d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 10 Jan 2017 12:24:15 -0800 Subject: gro: Disable frag0 optimization on IPv6 ext headers The GRO fast path caches the frag0 address. This address becomes invalid if frag0 is modified by pskb_may_pull or its variants. So whenever that happens we must disable the frag0 optimization. This is usually done through the combination of gro_header_hard and gro_header_slow, however, the IPv6 extension header path did the pulling directly and would continue to use the GRO fast path incorrectly. This patch fixes it by disabling the fast path when we enter the IPv6 extension header path. Fixes: 78a478d0efd9 ("gro: Inline skb_gro_header and cache frag0 virtual address") Reported-by: Slava Shwartsman Signed-off-by: Herbert Xu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 994f7423a74b..9bde9558b596 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2477,14 +2477,19 @@ static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen) return NAPI_GRO_CB(skb)->frag0_len < hlen; } +static inline void skb_gro_frag0_invalidate(struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; +} + static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, unsigned int offset) { if (!pskb_may_pull(skb, hlen)) return NULL; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; + skb_gro_frag0_invalidate(skb); return skb->data + offset; } -- cgit v1.2.3 From 097963959594c5eccaba42510f7033f703211bda Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 10 Jan 2017 16:57:21 -0800 Subject: mm: add follow_pte_pmd() Patch series "Write protect DAX PMDs in *sync path". Currently dax_mapping_entry_mkclean() fails to clean and write protect the pmd_t of a DAX PMD entry during an *sync operation. This can result in data loss, as detailed in patch 2. This series is based on Dan's "libnvdimm-pending" branch, which is the current home for Jan's "dax: Page invalidation fixes" series. You can find a working tree here: https://git.kernel.org/cgit/linux/kernel/git/zwisler/linux.git/log/?h=dax_pmd_clean This patch (of 2): Similar to follow_pte(), follow_pte_pmd() allows either a PTE leaf or a huge page PMD leaf to be found and returned. Link: http://lkml.kernel.org/r/1482272586-21177-2-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Suggested-by: Dave Hansen Cc: Alexander Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fe6b4036664a..02793ac64ac6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1212,6 +1212,8 @@ void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp); +int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, -- cgit v1.2.3 From f729c8c9b24f0540a6e6b86e68f3888ba90ef7e7 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 10 Jan 2017 16:57:24 -0800 Subject: dax: wrprotect pmd_t in dax_mapping_entry_mkclean Currently dax_mapping_entry_mkclean() fails to clean and write protect the pmd_t of a DAX PMD entry during an *sync operation. This can result in data loss in the following sequence: 1) mmap write to DAX PMD, dirtying PMD radix tree entry and making the pmd_t dirty and writeable 2) fsync, flushing out PMD data and cleaning the radix tree entry. We currently fail to mark the pmd_t as clean and write protected. 3) more mmap writes to the PMD. These don't cause any page faults since the pmd_t is dirty and writeable. The radix tree entry remains clean. 4) fsync, which fails to flush the dirty PMD data because the radix tree entry was clean. 5) crash - dirty data that should have been fsync'd as part of 4) could still have been in the processor cache, and is lost. Fix this by marking the pmd_t clean and write protected in dax_mapping_entry_mkclean(), which is called as part of the fsync operation 2). This will cause the writes in step 3) above to generate page faults where we'll re-dirty the PMD radix tree entry, resulting in flushes in the fsync that happens in step 4). Fixes: 4b4bb46d00b3 ("dax: clear dirty entry tags on cache flush") Link: http://lkml.kernel.org/r/1482272586-21177-3-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Jan Kara Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 02793ac64ac6..b84615b0f64c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1210,8 +1210,6 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); -int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, - spinlock_t **ptlp); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, -- cgit v1.2.3 From bb1107f7c6052c863692a41f78c000db792334bf Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:57:27 -0800 Subject: mm, slab: make sure that KMALLOC_MAX_SIZE will fit into MAX_ORDER Andrey Konovalov has reported the following warning triggered by the syzkaller fuzzer. WARNING: CPU: 1 PID: 9935 at mm/page_alloc.c:3511 __alloc_pages_nodemask+0x159c/0x1e20 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 9935 Comm: syz-executor0 Not tainted 4.9.0-rc7+ #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __alloc_pages_slowpath mm/page_alloc.c:3511 __alloc_pages_nodemask+0x159c/0x1e20 mm/page_alloc.c:3781 alloc_pages_current+0x1c7/0x6b0 mm/mempolicy.c:2072 alloc_pages include/linux/gfp.h:469 kmalloc_order+0x1f/0x70 mm/slab_common.c:1015 kmalloc_order_trace+0x1f/0x160 mm/slab_common.c:1026 kmalloc_large include/linux/slab.h:422 __kmalloc+0x210/0x2d0 mm/slub.c:3723 kmalloc include/linux/slab.h:495 ep_write_iter+0x167/0xb50 drivers/usb/gadget/legacy/inode.c:664 new_sync_write fs/read_write.c:499 __vfs_write+0x483/0x760 fs/read_write.c:512 vfs_write+0x170/0x4e0 fs/read_write.c:560 SYSC_write fs/read_write.c:607 SyS_write+0xfb/0x230 fs/read_write.c:599 entry_SYSCALL_64_fastpath+0x1f/0xc2 The issue is caused by a lack of size check for the request size in ep_write_iter which should be fixed. It, however, points to another problem, that SLUB defines KMALLOC_MAX_SIZE too large because the its KMALLOC_SHIFT_MAX is (MAX_ORDER + PAGE_SHIFT) which means that the resulting page allocator request might be MAX_ORDER which is too large (see __alloc_pages_slowpath). The same applies to the SLOB allocator which allows even larger sizes. Make sure that they are capped properly and never request more than MAX_ORDER order. Link: http://lkml.kernel.org/r/20161220130659.16461-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Andrey Konovalov Acked-by: Christoph Lameter Cc: Alexei Starovoitov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 084b12bad198..4c5363566815 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -226,7 +226,7 @@ static inline const char *__check_heap_object(const void *ptr, * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif @@ -239,7 +239,7 @@ static inline const char *__check_heap_object(const void *ptr, * be allocated from the same page. */ #define KMALLOC_SHIFT_HIGH PAGE_SHIFT -#define KMALLOC_SHIFT_MAX 30 +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif -- cgit v1.2.3 From 41b6167e8f746b475668f1da78599fc4284f18db Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:57:42 -0800 Subject: mm: get rid of __GFP_OTHER_NODE The flag was introduced by commit 78afd5612deb ("mm: add __GFP_OTHER_NODE flag") to allow proper accounting of remote node allocations done by kernel daemons on behalf of a process - e.g. khugepaged. After "mm: fix remote numa hits statistics" we do not need and actually use the flag so we can safely remove it because all allocations which are satisfied from their "home" node are accounted properly. [mhocko@suse.com: fix build] Link: http://lkml.kernel.org/r/20170106122225.GK5556@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20170102153057.9451-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Taku Izumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4175dca4ac39..7806a8f80abc 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -38,9 +38,8 @@ struct vm_area_struct; #define ___GFP_ACCOUNT 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_DIRECT_RECLAIM 0x400000u -#define ___GFP_OTHER_NODE 0x800000u -#define ___GFP_WRITE 0x1000000u -#define ___GFP_KSWAPD_RECLAIM 0x2000000u +#define ___GFP_WRITE 0x800000u +#define ___GFP_KSWAPD_RECLAIM 0x1000000u /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -172,11 +171,6 @@ struct vm_area_struct; * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of * distinguishing in the source between false positives and allocations that * cannot be supported (e.g. page tables). - * - * __GFP_OTHER_NODE is for allocations that are on a remote node but that - * should not be accounted for as a remote allocation in vmstat. A - * typical user would be khugepaged collapsing a huge page on a remote - * node. */ #define __GFP_COLD ((__force gfp_t)___GFP_COLD) #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) @@ -184,10 +178,9 @@ struct vm_area_struct; #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT 26 +#define __GFP_BITS_SHIFT 25 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* -- cgit v1.2.3 From 2d39b3cd34e6d323720d4c61bd714f5ae202c022 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Tue, 10 Jan 2017 16:57:54 -0800 Subject: signal: protect SIGNAL_UNKILLABLE from unintentional clearing. Since commit 00cd5c37afd5 ("ptrace: permit ptracing of /sbin/init") we can now trace init processes. init is initially protected with SIGNAL_UNKILLABLE which will prevent fatal signals such as SIGSTOP, but there are a number of paths during tracing where SIGNAL_UNKILLABLE can be implicitly cleared. This can result in init becoming stoppable/killable after tracing. For example, running: while true; do kill -STOP 1; done & strace -p 1 and then stopping strace and the kill loop will result in init being left in state TASK_STOPPED. Sending SIGCONT to init will resume it, but init will now respond to future SIGSTOP signals rather than ignoring them. Make sure that when setting SIGNAL_STOP_CONTINUED/SIGNAL_STOP_STOPPED that we don't clear SIGNAL_UNKILLABLE. Link: http://lkml.kernel.org/r/20170104122017.25047-1-jamie.iles@oracle.com Signed-off-by: Jamie Iles Acked-by: Oleg Nesterov Cc: Alexander Viro Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4d1905245c7a..ad3ec9ec61f7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -854,6 +854,16 @@ struct signal_struct { #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ +#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ + SIGNAL_STOP_CONTINUED) + +static inline void signal_set_stop_flags(struct signal_struct *sig, + unsigned int flags) +{ + WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); + sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; +} + /* If true, all threads except ->group_exit_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) { -- cgit v1.2.3 From b4536f0c829c8586544c94735c343f9b5070bd01 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:58:04 -0800 Subject: mm, memcg: fix the active list aging for lowmem requests when memcg is enabled Nils Holland and Klaus Ethgen have reported unexpected OOM killer invocations with 32b kernel starting with 4.8 kernels kworker/u4:5 invoked oom-killer: gfp_mask=0x2400840(GFP_NOFS|__GFP_NOFAIL), nodemask=0, order=0, oom_score_adj=0 kworker/u4:5 cpuset=/ mems_allowed=0 CPU: 1 PID: 2603 Comm: kworker/u4:5 Not tainted 4.9.0-gentoo #2 [...] Mem-Info: active_anon:58685 inactive_anon:90 isolated_anon:0 active_file:274324 inactive_file:281962 isolated_file:0 unevictable:0 dirty:649 writeback:0 unstable:0 slab_reclaimable:40662 slab_unreclaimable:17754 mapped:7382 shmem:202 pagetables:351 bounce:0 free:206736 free_pcp:332 free_cma:0 Node 0 active_anon:234740kB inactive_anon:360kB active_file:1097296kB inactive_file:1127848kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:29528kB dirty:2596kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 184320kB anon_thp: 808kB writeback_tmp:0kB unstable:0kB pages_scanned:0 all_unreclaimable? no DMA free:3952kB min:788kB low:984kB high:1180kB active_anon:0kB inactive_anon:0kB active_file:7316kB inactive_file:0kB unevictable:0kB writepending:96kB present:15992kB managed:15916kB mlocked:0kB slab_reclaimable:3200kB slab_unreclaimable:1408kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 813 3474 3474 Normal free:41332kB min:41368kB low:51708kB high:62048kB active_anon:0kB inactive_anon:0kB active_file:532748kB inactive_file:44kB unevictable:0kB writepending:24kB present:897016kB managed:836248kB mlocked:0kB slab_reclaimable:159448kB slab_unreclaimable:69608kB kernel_stack:1112kB pagetables:1404kB bounce:0kB free_pcp:528kB local_pcp:340kB free_cma:0kB lowmem_reserve[]: 0 0 21292 21292 HighMem free:781660kB min:512kB low:34356kB high:68200kB active_anon:234740kB inactive_anon:360kB active_file:557232kB inactive_file:1127804kB unevictable:0kB writepending:2592kB present:2725384kB managed:2725384kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:800kB local_pcp:608kB free_cma:0kB the oom killer is clearly pre-mature because there there is still a lot of page cache in the zone Normal which should satisfy this lowmem request. Further debugging has shown that the reclaim cannot make any forward progress because the page cache is hidden in the active list which doesn't get rotated because inactive_list_is_low is not memcg aware. The code simply subtracts per-zone highmem counters from the respective memcg's lru sizes which doesn't make any sense. We can simply end up always seeing the resulting active and inactive counts 0 and return false. This issue is not limited to 32b kernels but in practice the effect on systems without CONFIG_HIGHMEM would be much harder to notice because we do not invoke the OOM killer for allocations requests targeting < ZONE_NORMAL. Fix the issue by tracking per zone lru page counts in mem_cgroup_per_node and subtract per-memcg highmem counts when memcg is enabled. Introduce helper lruvec_zone_lru_size which redirects to either zone counters or mem_cgroup_get_zone_lru_size when appropriate. We are losing empty LRU but non-zero lru size detection introduced by ca707239e8a7 ("mm: update_lru_size warn and reset bad lru_size") because of the inherent zone vs. node discrepancy. Fixes: f8d1a31163fc ("mm: consider whether to decivate based on eligible zones inactive ratio") Link: http://lkml.kernel.org/r/20170104100825.3729-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Nils Holland Tested-by: Nils Holland Reported-by: Klaus Ethgen Acked-by: Minchan Kim Acked-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Vladimir Davydov Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 26 +++++++++++++++++++++++--- include/linux/mm_inline.h | 2 +- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 61d20c17f3b7..254698856b8f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -120,7 +120,7 @@ struct mem_cgroup_reclaim_iter { */ struct mem_cgroup_per_node { struct lruvec lruvec; - unsigned long lru_size[NR_LRU_LISTS]; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; @@ -432,7 +432,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int nr_pages); + int zid, int nr_pages); unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask); @@ -441,9 +441,23 @@ static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { struct mem_cgroup_per_node *mz; + unsigned long nr_pages = 0; + int zid; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - return mz->lru_size[lru]; + for (zid = 0; zid < MAX_NR_ZONES; zid++) + nr_pages += mz->lru_zone_size[zid][lru]; + return nr_pages; +} + +static inline +unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, + enum lru_list lru, int zone_idx) +{ + struct mem_cgroup_per_node *mz; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + return mz->lru_zone_size[zone_idx][lru]; } void mem_cgroup_handle_over_high(void); @@ -671,6 +685,12 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { return 0; } +static inline +unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, + enum lru_list lru, int zone_idx) +{ + return 0; +} static inline unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 71613e8a720f..41d376e7116d 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -39,7 +39,7 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } -- cgit v1.2.3 From 8c2dd3e4a4bae78093c4a5cee6494877651be3c9 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 10 Jan 2017 16:58:06 -0800 Subject: mm: rename __alloc_page_frag to page_frag_alloc and __free_page_frag to page_frag_free Patch series "Page fragment updates", v4. This patch series takes care of a few cleanups for the page fragments API. First we do some renames so that things are much more consistent. First we move the page_frag_ portion of the name to the front of the functions names. Secondly we split out the cache specific functions from the other page fragment functions by adding the word "cache" to the name. Finally I added a bit of documentation that will hopefully help to explain some of this. I plan to revisit this later as we get things more ironed out in the near future with the changes planned for the DMA setup to support eXpress Data Path. This patch (of 3): This patch renames the page frag functions to be more consistent with other APIs. Specifically we place the name page_frag first in the name and then have either an alloc or free call name that we append as the suffix. This makes it a bit clearer in terms of naming. In addition we drop the leading double underscores since we are technically no longer a backing interface and instead the front end that is called from the networking APIs. Link: http://lkml.kernel.org/r/20170104023854.13451.67390.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 6 +++--- include/linux/skbuff.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7806a8f80abc..ed77a86fbbb0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -501,9 +501,9 @@ extern void free_hot_cold_page_list(struct list_head *list, bool cold); struct page_frag_cache; extern void __page_frag_drain(struct page *page, unsigned int order, unsigned int count); -extern void *__alloc_page_frag(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask); -extern void __free_page_frag(void *addr); +extern void *page_frag_alloc(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask); +extern void page_frag_free(void *addr); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b53c0cfd417e..a410715bbef8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2480,7 +2480,7 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, static inline void skb_free_frag(void *addr) { - __free_page_frag(addr); + page_frag_free(addr); } void *napi_alloc_frag(unsigned int fragsz); -- cgit v1.2.3 From 2976db8018532b624c4123ae662fbc0814877abf Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 10 Jan 2017 16:58:09 -0800 Subject: mm: rename __page_frag functions to __page_frag_cache, drop order from drain This patch does two things. First it goes through and renames the __page_frag prefixed functions to __page_frag_cache so that we can be clear that we are draining or refilling the cache, not the frags themselves. Second we drop the order parameter from __page_frag_cache_drain since we don't actually need to pass it since all fragments are either order 0 or must be a compound page. Link: http://lkml.kernel.org/r/20170104023954.13451.5678.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ed77a86fbbb0..0fe0b6295ab5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -499,8 +499,7 @@ extern void free_hot_cold_page(struct page *page, bool cold); extern void free_hot_cold_page_list(struct list_head *list, bool cold); struct page_frag_cache; -extern void __page_frag_drain(struct page *page, unsigned int order, - unsigned int count); +extern void __page_frag_cache_drain(struct page *page, unsigned int count); extern void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask); extern void page_frag_free(void *addr); -- cgit v1.2.3 From f05714293a591038304ddae7cb0dd747bb3786cc Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2017 16:58:15 -0800 Subject: mm: support anonymous stable page During developemnt for zram-swap asynchronous writeback, I found strange corruption of compressed page, resulting in: Modules linked in: zram(E) CPU: 3 PID: 1520 Comm: zramd-1 Tainted: G E 4.8.0-mm1-00320-ge0d4894c9c38-dirty #3274 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff88007620b840 task.stack: ffff880078090000 RIP: set_freeobj.part.43+0x1c/0x1f RSP: 0018:ffff880078093ca8 EFLAGS: 00010246 RAX: 0000000000000018 RBX: ffff880076798d88 RCX: ffffffff81c408c8 RDX: 0000000000000018 RSI: 0000000000000000 RDI: 0000000000000246 RBP: ffff880078093cb0 R08: 0000000000000000 R09: 0000000000000000 R10: ffff88005bc43030 R11: 0000000000001df3 R12: ffff880076798d88 R13: 000000000005bc43 R14: ffff88007819d1b8 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff88007e380000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc934048f20 CR3: 0000000077b01000 CR4: 00000000000406e0 Call Trace: obj_malloc+0x22b/0x260 zs_malloc+0x1e4/0x580 zram_bvec_rw+0x4cd/0x830 [zram] page_requests_rw+0x9c/0x130 [zram] zram_thread+0xe6/0x173 [zram] kthread+0xca/0xe0 ret_from_fork+0x25/0x30 With investigation, it reveals currently stable page doesn't support anonymous page. IOW, reuse_swap_page can reuse the page without waiting writeback completion so it can overwrite page zram is compressing. Unfortunately, zram has used per-cpu stream feature from v4.7. It aims for increasing cache hit ratio of scratch buffer for compressing. Downside of that approach is that zram should ask memory space for compressed page in per-cpu context which requires stricted gfp flag which could be failed. If so, it retries to allocate memory space out of per-cpu context so it could get memory this time and compress the data again, copies it to the memory space. In this scenario, zram assumes the data should never be changed but it is not true unless stable page supports. So, If the data is changed under us, zram can make buffer overrun because second compression size could be bigger than one we got in previous trial and blindly, copy bigger size object to smaller buffer which is buffer overrun. The overrun breaks zsmalloc free object chaining so system goes crash like above. I think below is same problem. https://bugzilla.suse.com/show_bug.cgi?id=997574 Unfortunately, reuse_swap_page should be atomic so that we cannot wait on writeback in there so the approach in this patch is simply return false if we found it needs stable page. Although it increases memory footprint temporarily, it happens rarely and it should be reclaimed easily althoug it happened. Also, It would be better than waiting of IO completion, which is critial path for application latency. Fixes: da9556a2367c ("zram: user per-cpu compression streams") Link: http://lkml.kernel.org/r/20161120233015.GA14113@bbox Link: http://lkml.kernel.org/r/1482366980-3782-2-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Cc: Sergey Senozhatsky Cc: Darrick J. Wong Cc: Takashi Iwai Cc: Hyeoncheol Lee Cc: Cc: Sangseok Lee Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 09f4be179ff3..7f47b7098b1b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -150,8 +150,9 @@ enum { SWP_FILE = (1 << 7), /* set after swap_activate success */ SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ + SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ /* add others here before... */ - SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL -- cgit v1.2.3 From 575b1967e10a1f3038266244d2c7a3ca6b99fed8 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 10 Jan 2017 16:58:30 -0800 Subject: timerfd: export defines to userspace Since userspace is expected to call timerfd syscalls directly with these flags/ioctls, make sure we export them so they don't have to duplicate the values themselves. Link: http://lkml.kernel.org/r/20161219064052.7196-1-vapier@gentoo.org Signed-off-by: Mike Frysinger Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timerfd.h | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timerfd.h b/include/linux/timerfd.h index bd36ce431e32..bab0b1ad0613 100644 --- a/include/linux/timerfd.h +++ b/include/linux/timerfd.h @@ -8,23 +8,7 @@ #ifndef _LINUX_TIMERFD_H #define _LINUX_TIMERFD_H -/* For O_CLOEXEC and O_NONBLOCK */ -#include - -/* For _IO helpers */ -#include - -/* - * CAREFUL: Check include/asm-generic/fcntl.h when defining - * new flags, since they might collide with O_* ones. We want - * to re-use O_* flags that couldn't possibly have a meaning - * from eventfd, in order to leave a free define-space for - * shared O_* flags. - */ -#define TFD_TIMER_ABSTIME (1 << 0) -#define TFD_TIMER_CANCEL_ON_SET (1 << 1) -#define TFD_CLOEXEC O_CLOEXEC -#define TFD_NONBLOCK O_NONBLOCK +#include #define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK) /* Flags for timerfd_create. */ @@ -32,6 +16,4 @@ /* Flags for timerfd_settime. */ #define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET) -#define TFD_IOC_SET_TICKS _IOW('T', 0, u64) - #endif /* _LINUX_TIMERFD_H */ -- cgit v1.2.3 From b6416e61012429e0277bd15a229222fd17afc1c1 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 16 Dec 2016 14:30:35 -0800 Subject: jump_labels: API for flushing deferred jump label updates Modules that use static_key_deferred need a way to synchronize with any delayed work that is still pending when the module is unloaded. Introduce static_key_deferred_flush() which flushes any pending jump label updates. Signed-off-by: David Matlack Cc: stable@vger.kernel.org Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paolo Bonzini --- include/linux/jump_label_ratelimit.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h index 089f70f83e97..23da3af459fe 100644 --- a/include/linux/jump_label_ratelimit.h +++ b/include/linux/jump_label_ratelimit.h @@ -14,6 +14,7 @@ struct static_key_deferred { #ifdef HAVE_JUMP_LABEL extern void static_key_slow_dec_deferred(struct static_key_deferred *key); +extern void static_key_deferred_flush(struct static_key_deferred *key); extern void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); @@ -26,6 +27,10 @@ static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) STATIC_KEY_CHECK_USE(); static_key_slow_dec(&key->key); } +static inline void static_key_deferred_flush(struct static_key_deferred *key) +{ + STATIC_KEY_CHECK_USE(); +} static inline void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) -- cgit v1.2.3 From f99e86485cc32cd16e5cc97f9bb0474f28608d84 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 12 Jan 2017 07:58:32 -0700 Subject: block: Rename blk_queue_zone_size and bdev_zone_size All block device data fields and functions returning a number of 512B sectors are by convention named xxx_sectors while names in the form xxx_size are generally used for a number of bytes. The blk_queue_zone_size and bdev_zone_size functions were not following this convention so rename them. No functional change is introduced by this patch. Signed-off-by: Damien Le Moal Collapsed the two patches, they were nonsensically split and broke bisection. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 83695641bd5e..ff3d774f2751 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -739,7 +739,7 @@ static inline bool blk_queue_is_zoned(struct request_queue *q) } } -static inline unsigned int blk_queue_zone_size(struct request_queue *q) +static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) { return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } @@ -1536,12 +1536,12 @@ static inline bool bdev_is_zoned(struct block_device *bdev) return false; } -static inline unsigned int bdev_zone_size(struct block_device *bdev) +static inline unsigned int bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q) - return blk_queue_zone_size(q); + return blk_queue_zone_sectors(q); return 0; } -- cgit v1.2.3 From 331c34255293cd02d395b7097008b509ba89e60e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 4 Jan 2017 20:57:22 -0800 Subject: i2c: do not enable fall back to Host Notify by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Falling back unconditionally to HostNotify as primary client's interrupt breaks some drivers which alter their functionality depending on whether interrupt is present or not, so let's introduce a board flag telling I2C core explicitly if we want wired interrupt or HostNotify-based one: I2C_CLIENT_HOST_NOTIFY. For DT-based systems we introduce "host-notify" property that we convert to I2C_CLIENT_HOST_NOTIFY board flag. Tested-by: Benjamin Tissoires Signed-off-by: Dmitry Torokhov Acked-by: Pali Rohár Acked-by: Rob Herring Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index b2109c522dec..4b45ec46161f 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -665,6 +665,7 @@ i2c_unlock_adapter(struct i2c_adapter *adapter) #define I2C_CLIENT_TEN 0x10 /* we have a ten bit chip address */ /* Must equal I2C_M_TEN below */ #define I2C_CLIENT_SLAVE 0x20 /* we are the slave */ +#define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ #define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ #define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ /* Must match I2C_M_STOP|IGNORE_NAK */ -- cgit v1.2.3 From 546125d1614264d26080817d0c8cddb9b25081fa Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Thu, 5 Jan 2017 16:34:51 -0500 Subject: sunrpc: don't call sleeping functions from the notifier block callbacks The inet6addr_chain is an atomic notifier chain, so we can't call anything that might sleep (like lock_sock)... instead of closing the socket from svc_age_temp_xprts_now (which is called by the notifier function), just have the rpc service threads do it instead. Cc: stable@vger.kernel.org Fixes: c3d4879e01be "sunrpc: Add a function to close..." Signed-off-by: Scott Mayhew Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index e5d193440374..7440290f64ac 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -66,6 +66,7 @@ struct svc_xprt { #define XPT_LISTENER 10 /* listening endpoint */ #define XPT_CACHE_AUTH 11 /* cache auth info */ #define XPT_LOCAL 12 /* connection from loopback interface */ +#define XPT_KILL_TEMP 13 /* call xpo_kill_temp_xprt before closing */ struct svc_serv *xpt_server; /* service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ -- cgit v1.2.3 From 003c941057eaa868ca6fedd29a274c863167230d Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 12 Jan 2017 14:24:58 -0800 Subject: tcp: fix tcp_fastopen unaligned access complaints on sparc Fix up a data alignment issue on sparc by swapping the order of the cookie byte array field with the length field in struct tcp_fastopen_cookie, and making it a proper union to clean up the typecasting. This addresses log complaints like these: log_unaligned: 113 callbacks suppressed Kernel unaligned access at TPC[976490] tcp_try_fastopen+0x2d0/0x360 Kernel unaligned access at TPC[9764ac] tcp_try_fastopen+0x2ec/0x360 Kernel unaligned access at TPC[9764c8] tcp_try_fastopen+0x308/0x360 Kernel unaligned access at TPC[9764e4] tcp_try_fastopen+0x324/0x360 Kernel unaligned access at TPC[976490] tcp_try_fastopen+0x2d0/0x360 Cc: Eric Dumazet Signed-off-by: Shannon Nelson Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fc5848dad7a4..c93f4b3a59cb 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -62,8 +62,13 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb) /* TCP Fast Open Cookie as stored in memory */ struct tcp_fastopen_cookie { + union { + u8 val[TCP_FASTOPEN_COOKIE_MAX]; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr addr; +#endif + }; s8 len; - u8 val[TCP_FASTOPEN_COOKIE_MAX]; bool exp; /* In RFC6994 experimental option format */ }; -- cgit v1.2.3 From 2e3258ecfaebace1ceffaa14e0ea94775d54f46f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 13 Jan 2017 12:29:10 +0100 Subject: block: add blk_rq_payload_bytes Add a helper to calculate the actual data transfer size for special payload requests. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ff3d774f2751..1ca8e8fd1078 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1000,6 +1000,19 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> 9; } +/* + * Some commands like WRITE SAME have a payload or data transfer size which + * is different from the size of the request. Any driver that supports such + * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to + * calculate the data transfer size. + */ +static inline unsigned int blk_rq_payload_bytes(struct request *rq) +{ + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return rq->special_vec.bv_len; + return blk_rq_bytes(rq); +} + static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, int op) { -- cgit v1.2.3 From 475113d937adfd150eb82b5e2c5507125a68e7af Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 28 Dec 2016 14:31:03 +0100 Subject: perf/x86/intel: Account interrupts for PEBS errors It's possible to set up PEBS events to get only errors and not any data, like on SNB-X (model 45) and IVB-EP (model 62) via 2 perf commands running simultaneously: taskset -c 1 ./perf record -c 4 -e branches:pp -j any -C 10 This leads to a soft lock up, because the error path of the intel_pmu_drain_pebs_nhm() does not account event->hw.interrupt for error PEBS interrupts, so in case you're getting ONLY errors you don't have a way to stop the event when it's over the max_samples_per_tick limit: NMI watchdog: BUG: soft lockup - CPU#22 stuck for 22s! [perf_fuzzer:5816] ... RIP: 0010:[] [] smp_call_function_single+0xe2/0x140 ... Call Trace: ? trace_hardirqs_on_caller+0xf5/0x1b0 ? perf_cgroup_attach+0x70/0x70 perf_install_in_context+0x199/0x1b0 ? ctx_resched+0x90/0x90 SYSC_perf_event_open+0x641/0xf90 SyS_perf_event_open+0x9/0x10 do_syscall_64+0x6c/0x1f0 entry_SYSCALL64_slow_path+0x25/0x25 Add perf_event_account_interrupt() which does the interrupt and frequency checks and call it from intel_pmu_drain_pebs_nhm()'s error path. We keep the pending_kill and pending_wakeup logic only in the __perf_event_overflow() path, because they make sense only if there's any data to deliver. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: Vince Weaver Link: http://lkml.kernel.org/r/1482931866-6018-2-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4741ecdb9817..78ed8105e64d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1259,6 +1259,7 @@ extern void perf_event_disable(struct perf_event *event); extern void perf_event_disable_local(struct perf_event *event); extern void perf_event_disable_inatomic(struct perf_event *event); extern void perf_event_task_tick(void); +extern int perf_event_account_interrupt(struct perf_event *event); #else /* !CONFIG_PERF_EVENTS: */ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, -- cgit v1.2.3 From 0100a3e67a9cef64d72cd3a1da86f3ddbee50363 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Mon, 12 Dec 2016 18:42:28 -0500 Subject: efi/x86: Prune invalid memory map entries and fix boot regression Some machines, such as the Lenovo ThinkPad W541 with firmware GNET80WW (2.28), include memory map entries with phys_addr=0x0 and num_pages=0. These machines fail to boot after the following commit, commit 8e80632fb23f ("efi/esrt: Use efi_mem_reserve() and avoid a kmalloc()") Fix this by removing such bogus entries from the memory map. Furthermore, currently the log output for this case (with efi=debug) looks like: [ 0.000000] efi: mem45: [Reserved | | | | | | | | | | | | ] range=[0x0000000000000000-0xffffffffffffffff] (0MB) This is clearly wrong, and also not as informative as it could be. This patch changes it so that if we find obviously invalid memory map entries, we print an error and skip those entries. It also detects the display of the address range calculation overflow, so the new output is: [ 0.000000] efi: [Firmware Bug]: Invalid EFI memory map entries: [ 0.000000] efi: mem45: [Reserved | | | | | | | | | | | | ] range=[0x0000000000000000-0x0000000000000000] (invalid) It also detects memory map sizes that would overflow the physical address, for example phys_addr=0xfffffffffffff000 and num_pages=0x0200000000000001, and prints: [ 0.000000] efi: [Firmware Bug]: Invalid EFI memory map entries: [ 0.000000] efi: mem45: [Reserved | | | | | | | | | | | | ] range=[phys_addr=0xfffffffffffff000-0x20ffffffffffffffff] (invalid) It then removes these entries from the memory map. Signed-off-by: Peter Jones Signed-off-by: Ard Biesheuvel [ardb: refactor for clarity with no functional changes, avoid PAGE_SHIFT] Signed-off-by: Matt Fleming [Matt: Include bugzilla info in commit log] Cc: # v4.9+ Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://bugzilla.kernel.org/show_bug.cgi?id=191121 Signed-off-by: Ingo Molnar --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 0c5420208c40..5b1af30ece55 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -103,6 +103,7 @@ typedef struct { #define EFI_PAGE_SHIFT 12 #define EFI_PAGE_SIZE (1UL << EFI_PAGE_SHIFT) +#define EFI_PAGES_MAX (U64_MAX >> EFI_PAGE_SHIFT) typedef struct { u32 type; -- cgit v1.2.3 From 4d22c75d4c7b5c5f4bd31054f09103ee490878fd Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Wed, 11 Jan 2017 13:25:00 -0600 Subject: coredump: Ensure proper size of sparse core files If the last section of a core file ends with an unmapped or zero page, the size of the file does not correspond with the last dump_skip() call. gdb complains that the file is truncated and can be confusing to users. After all of the vma sections are written, make sure that the file size is no smaller than the current file position. This problem can be demonstrated with gdb's bigcore testcase on the sparc architecture. Signed-off-by: Dave Kleikamp Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Al Viro --- include/linux/coredump.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index d016a121a8c4..28ffa94aed6b 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -14,6 +14,7 @@ struct coredump_params; extern int dump_skip(struct coredump_params *cprm, size_t nr); extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); +extern void dump_truncate(struct coredump_params *cprm); #ifdef CONFIG_COREDUMP extern void do_coredump(const siginfo_t *siginfo); #else -- cgit v1.2.3 From 52d7e48b86fc108e45a656d8e53e4237993c481d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Jan 2017 02:28:26 -0800 Subject: rcu: Narrow early boot window of illegal synchronous grace periods The current preemptible RCU implementation goes through three phases during bootup. In the first phase, there is only one CPU that is running with preemption disabled, so that a no-op is a synchronous grace period. In the second mid-boot phase, the scheduler is running, but RCU has not yet gotten its kthreads spawned (and, for expedited grace periods, workqueues are not yet running. During this time, any attempt to do a synchronous grace period will hang the system (or complain bitterly, depending). In the third and final phase, RCU is fully operational and everything works normally. This has been OK for some time, but there has recently been some synchronous grace periods showing up during the second mid-boot phase. This code worked "by accident" for awhile, but started failing as soon as expedited RCU grace periods switched over to workqueues in commit 8b355e3bc140 ("rcu: Drive expedited grace periods from workqueue"). Note that the code was buggy even before this commit, as it was subject to failure on real-time systems that forced all expedited grace periods to run as normal grace periods (for example, using the rcu_normal ksysfs parameter). The callchain from the failure case is as follows: early_amd_iommu_init() |-> acpi_put_table(ivrs_base); |-> acpi_tb_put_table(table_desc); |-> acpi_tb_invalidate_table(table_desc); |-> acpi_tb_release_table(...) |-> acpi_os_unmap_memory |-> acpi_os_unmap_iomem |-> acpi_os_map_cleanup |-> synchronize_rcu_expedited The kernel showing this callchain was built with CONFIG_PREEMPT_RCU=y, which caused the code to try using workqueues before they were initialized, which did not go well. This commit therefore reworks RCU to permit synchronous grace periods to proceed during this mid-boot phase. This commit is therefore a fix to a regression introduced in v4.9, and is therefore being put forward post-merge-window in v4.10. This commit sets a flag from the existing rcu_scheduler_starting() function which causes all synchronous grace periods to take the expedited path. The expedited path now checks this flag, using the requesting task to drive the expedited grace period forward during the mid-boot phase. Finally, this flag is updated by a core_initcall() function named rcu_exp_runtime_mode(), which causes the runtime codepaths to be used. Note that this arrangement assumes that tasks are not sent POSIX signals (or anything similar) from the time that the first task is spawned through core_initcall() time. Fixes: 8b355e3bc140 ("rcu: Drive expedited grace periods from workqueue") Reported-by: "Zheng, Lv" Reported-by: Borislav Petkov Signed-off-by: Paul E. McKenney Tested-by: Stan Kain Tested-by: Ivan Tested-by: Emanuel Castelo Tested-by: Bruno Pesavento Tested-by: Borislav Petkov Tested-by: Frederic Bezies Cc: # 4.9.0- --- include/linux/rcupdate.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 321f9ed552a9..01f71e1d2e94 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -444,6 +444,10 @@ bool __rcu_is_watching(void); #error "Unknown RCU implementation specified to kernel configuration" #endif +#define RCU_SCHEDULER_INACTIVE 0 +#define RCU_SCHEDULER_INIT 1 +#define RCU_SCHEDULER_RUNNING 2 + /* * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic * initialization and destruction of rcu_head on the stack. rcu_head structures -- cgit v1.2.3 From 4205e4786d0b9fc3b4fec7b1910cf645a0468307 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jan 2017 14:01:05 +0100 Subject: cpu/hotplug: Provide dynamic range for prepare stage Mathieu reported that the LTTNG modules are broken as of 4.10-rc1 due to the removal of the cpu hotplug notifiers. Usually I don't care much about out of tree modules, but LTTNG is widely used in distros. There are two ways to solve that: 1) Reserve a hotplug state for LTTNG 2) Add a dynamic range for the prepare states. While #1 is the simplest solution, #2 is the proper one as we can convert in tree users, which do not care about ordering, to the dynamic range as well. Add a dynamic range which allows LTTNG to request states in the prepare stage. Reported-and-tested-by: Mathieu Desnoyers Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Sebastian Sewior Cc: Steven Rostedt Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1701101353010.3401@nanos Signed-off-by: Thomas Gleixner --- include/linux/cpuhotplug.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 20bfefbe7594..d936a0021839 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -74,6 +74,8 @@ enum cpuhp_state { CPUHP_ZCOMP_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_MIPS_SOC_PREPARE, + CPUHP_BP_PREPARE_DYN, + CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, -- cgit v1.2.3 From f1f7714ea51c56b7163fb1a5acf39c6a204dd758 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 13 Jan 2017 23:38:15 +0100 Subject: bpf: rework prog_digest into prog_tag Commit 7bd509e311f4 ("bpf: add prog_digest and expose it via fdinfo/netlink") was recently discussed, partially due to admittedly suboptimal name of "prog_digest" in combination with sha1 hash usage, thus inevitably and rightfully concerns about its security in terms of collision resistance were raised with regards to use-cases. The intended use cases are for debugging resp. introspection only for providing a stable "tag" over the instruction sequence that both kernel and user space can calculate independently. It's not usable at all for making a security relevant decision. So collisions where two different instruction sequences generate the same tag can happen, but ideally at a rather low rate. The "tag" will be dumped in hex and is short enough to introspect in tracepoints or kallsyms output along with other data such as stack trace, etc. Thus, this patch performs a rename into prog_tag and truncates the tag to a short output (64 bits) to make it obvious it's not collision-free. Should in future a hash or facility be needed with a security relevant focus, then we can think about requirements, constraints, etc that would fit to that situation. For now, rework the exposed parts for the current use cases as long as nothing has been released yet. Tested on x86_64 and s390x. Fixes: 7bd509e311f4 ("bpf: add prog_digest and expose it via fdinfo/netlink") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Andy Lutomirski Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 +- include/linux/filter.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f74ae68086dc..05cf951df3fe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -216,7 +216,7 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); -int bpf_prog_calc_digest(struct bpf_prog *fp); +int bpf_prog_calc_tag(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); diff --git a/include/linux/filter.h b/include/linux/filter.h index a0934e6c9bab..e4eb2546339a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -57,6 +57,8 @@ struct bpf_prog_aux; /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 +#define BPF_TAG_SIZE 8 + /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ @@ -408,7 +410,7 @@ struct bpf_prog { kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ - u32 digest[SHA_DIGEST_WORDS]; /* Program digest */ + u8 tag[BPF_TAG_SIZE]; struct bpf_prog_aux *aux; /* Auxiliary fields */ struct sock_fprog_kern *orig_prog; /* Original BPF program */ unsigned int (*bpf_func)(const void *ctx, @@ -519,7 +521,7 @@ static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) return prog->len * sizeof(struct bpf_insn); } -static inline u32 bpf_prog_digest_scratch_size(const struct bpf_prog *prog) +static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog) { return round_up(bpf_prog_insn_size(prog) + sizeof(__be64) + 1, SHA_MESSAGE_BYTES); -- cgit v1.2.3 From 5eb7c0d04f04a667c049fe090a95494a8de2955c Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Sun, 1 Jan 2017 20:25:25 -0600 Subject: taint/module: Fix problems when out-of-kernel driver defines true or false Commit 7fd8329ba502 ("taint/module: Clean up global and module taint flags handling") used the key words true and false as character members of a new struct. These names cause problems when out-of-kernel modules such as VirtualBox include their own definitions of true and false. Fixes: 7fd8329ba502 ("taint/module: Clean up global and module taint flags handling") Signed-off-by: Larry Finger Cc: Petr Mladek Cc: Jessica Yu Cc: Rusty Russell Reported-by: Valdis Kletnieks Reviewed-by: Petr Mladek Acked-by: Rusty Russell Signed-off-by: Jessica Yu --- include/linux/kernel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 56aec84237ad..cb09238f6d32 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -514,8 +514,8 @@ extern enum system_states { #define TAINT_FLAGS_COUNT 16 struct taint_flag { - char true; /* character printed when tainted */ - char false; /* character printed when not tainted */ + char c_true; /* character printed when tainted */ + char c_false; /* character printed when not tainted */ bool module; /* also show as a per-module taint flag */ }; -- cgit v1.2.3 From 501db511397fd6efff3aa5b4e8de415b55559550 Mon Sep 17 00:00:00 2001 From: Rolf Neugebauer Date: Tue, 17 Jan 2017 18:13:51 +0000 Subject: virtio: don't set VIRTIO_NET_HDR_F_DATA_VALID on xmit This patch part reverts fd2a0437dc33 and e858fae2b0b8 which introduced a subtle change in how the virtio_net flags are derived from the SKBs ip_summed field. With the above commits, the flags are set to VIRTIO_NET_HDR_F_DATA_VALID when ip_summed == CHECKSUM_UNNECESSARY, thus treating it differently to ip_summed == CHECKSUM_NONE, which should be the same. Further, the virtio spec 1.0 / CS04 explicitly says that VIRTIO_NET_HDR_F_DATA_VALID must not be set by the driver. Fixes: fd2a0437dc33 ("virtio_net: introduce virtio_net_hdr_{from,to}_skb") Fixes: e858fae2b0b8 (" virtio_net: use common code for virtio_net_hdr and skb GSO conversion") Signed-off-by: Rolf Neugebauer Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 66204007d7ac..56436472ccc7 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -91,8 +91,6 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, skb_checksum_start_offset(skb)); hdr->csum_offset = __cpu_to_virtio16(little_endian, skb->csum_offset); - } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ return 0; -- cgit v1.2.3 From d407bd25a204bd66b7346dde24bd3d37ef0e0b05 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 18 Jan 2017 15:14:17 +0100 Subject: bpf: don't trigger OOM killer under pressure with map alloc This patch adds two helpers, bpf_map_area_alloc() and bpf_map_area_free(), that are to be used for map allocations. Using kmalloc() for very large allocations can cause excessive work within the page allocator, so i) fall back earlier to vmalloc() when the attempt is considered costly anyway, and even more importantly ii) don't trigger OOM killer with any of the allocators. Since this is based on a user space request, for example, when creating maps with element pre-allocation, we really want such requests to fail instead of killing other user space processes. Also, don't spam the kernel log with warnings should any of the allocations fail under pressure. Given that, we can make backend selection in bpf_map_area_alloc() generic, and convert all maps over to use this API for spots with potentially large allocation requests. Note, replacing the one kmalloc_array() is fine as overflow checks happen earlier in htab_map_alloc(), since it must also protect the multiplication for vmalloc() should kmalloc_array() fail. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 05cf951df3fe..3ed1f3b1d594 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -247,6 +247,8 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); +void *bpf_map_area_alloc(size_t size); +void bpf_map_area_free(void *base); extern int sysctl_unprivileged_bpf_disabled; -- cgit v1.2.3 From 739e6f5945d88dcee01590913f6886132a10c215 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 11 Jan 2017 13:37:07 +0100 Subject: gpio: provide lockdep keys for nested/unnested irqchips The helper function for adding a GPIO chip compiles in a lockdep key for debugging, the same key is needed for nested chips as well. The macro construction is unreadable, replace this with two static inlines instead. The _gpiochip_irqchip_add prefixed function is not helpful, rename it with gpiochip_irqchip_add_key() that tell us what the function is actually doing. Fixes: d245b3f9bd36 ("gpio: simplify adding threaded interrupts") Cc: Roger Quadros Reported-by: Clemens Gruber Reported-by: Roger Quadros Reported-by: Grygorii Strashko Tested-by: Clemens Gruber Tested-by: Grygorii Strashko Signed-off-by: Linus Walleij --- include/linux/gpio/driver.h | 70 ++++++++++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index c2748accea71..e973faba69dc 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -274,37 +274,67 @@ void gpiochip_set_nested_irqchip(struct gpio_chip *gpiochip, struct irq_chip *irqchip, int parent_irq); -int _gpiochip_irqchip_add(struct gpio_chip *gpiochip, +int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, + struct irq_chip *irqchip, + unsigned int first_irq, + irq_flow_handler_t handler, + unsigned int type, + bool nested, + struct lock_class_key *lock_key); + +#ifdef CONFIG_LOCKDEP + +/* + * Lockdep requires that each irqchip instance be created with a + * unique key so as to avoid unnecessary warnings. This upfront + * boilerplate static inlines provides such a key for each + * unique instance. + */ +static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, + struct irq_chip *irqchip, + unsigned int first_irq, + irq_flow_handler_t handler, + unsigned int type) +{ + static struct lock_class_key key; + + return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, + handler, type, false, &key); +} + +static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, struct irq_chip *irqchip, unsigned int first_irq, irq_flow_handler_t handler, - unsigned int type, - bool nested, - struct lock_class_key *lock_key); + unsigned int type) +{ + + static struct lock_class_key key; + + return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, + handler, type, true, &key); +} +#else +static inline int gpiochip_irqchip_add(struct gpio_chip *gpiochip, + struct irq_chip *irqchip, + unsigned int first_irq, + irq_flow_handler_t handler, + unsigned int type) +{ + return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, + handler, type, false, NULL); +} -/* FIXME: I assume threaded IRQchips do not have the lockdep problem */ static inline int gpiochip_irqchip_add_nested(struct gpio_chip *gpiochip, struct irq_chip *irqchip, unsigned int first_irq, irq_flow_handler_t handler, unsigned int type) { - return _gpiochip_irqchip_add(gpiochip, irqchip, first_irq, - handler, type, true, NULL); + return gpiochip_irqchip_add_key(gpiochip, irqchip, first_irq, + handler, type, true, NULL); } - -#ifdef CONFIG_LOCKDEP -#define gpiochip_irqchip_add(...) \ -( \ - ({ \ - static struct lock_class_key _key; \ - _gpiochip_irqchip_add(__VA_ARGS__, false, &_key); \ - }) \ -) -#else -#define gpiochip_irqchip_add(...) \ - _gpiochip_irqchip_add(__VA_ARGS__, false, NULL) -#endif +#endif /* CONFIG_LOCKDEP */ #endif /* CONFIG_GPIOLIB_IRQCHIP */ -- cgit v1.2.3 From e326ce013a8e851193eb337aafb1aa396c533a61 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 20 Jan 2017 03:25:34 +0100 Subject: Revert "PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag" Revert commit 08b98d329165 (PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag) as it caused system suspend (in the default configuration) to fail on Dell XPS13 (9360) with the Kaby Lake processor. Fixes: 08b98d329165 (PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag) Reported-by: Paul Menzel Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 0c729c3c8549..d9718378a8be 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -194,8 +194,6 @@ struct platform_freeze_ops { }; #ifdef CONFIG_SUSPEND -extern suspend_state_t mem_sleep_default; - /** * suspend_set_ops - set platform dependent suspend operations * @ops: The new suspend operations to set. -- cgit v1.2.3 From 6391a4481ba0796805d6581e42f9f0418c099e34 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 20 Jan 2017 14:32:42 +0800 Subject: virtio-net: restore VIRTIO_HDR_F_DATA_VALID on receiving Commit 501db511397f ("virtio: don't set VIRTIO_NET_HDR_F_DATA_VALID on xmit") in fact disables VIRTIO_HDR_F_DATA_VALID on receiving path too, fixing this by adding a hint (has_data_valid) and set it only on the receiving path. Cc: Rolf Neugebauer Signed-off-by: Jason Wang Acked-by: Rolf Neugebauer Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 56436472ccc7..5209b5ed2a64 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -56,7 +56,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, struct virtio_net_hdr *hdr, - bool little_endian) + bool little_endian, + bool has_data_valid) { memset(hdr, 0, sizeof(*hdr)); /* no info leak */ @@ -91,6 +92,9 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, skb_checksum_start_offset(skb)); hdr->csum_offset = __cpu_to_virtio16(little_endian, skb->csum_offset); + } else if (has_data_valid && + skb->ip_summed == CHECKSUM_UNNECESSARY) { + hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ return 0; -- cgit v1.2.3 From 059aa734824165507c65fd30a55ff000afd14983 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 22 Jan 2017 14:04:29 -0500 Subject: nfs: Don't increment lock sequence ID after NFS4ERR_MOVED Xuan Qi reports that the Linux NFSv4 client failed to lock a file that was migrated. The steps he observed on the wire: 1. The client sent a LOCK request to the source server 2. The source server replied NFS4ERR_MOVED 3. The client switched to the destination server 4. The client sent the same LOCK request to the destination server with a bumped lock sequence ID 5. The destination server rejected the LOCK request with NFS4ERR_BAD_SEQID RFC 3530 section 8.1.5 provides a list of NFS errors which do not bump a lock sequence ID. However, RFC 3530 is now obsoleted by RFC 7530. In RFC 7530 section 9.1.7, this list has been updated by the addition of NFS4ERR_MOVED. Reported-by: Xuan Qi Signed-off-by: Chuck Lever Cc: stable@vger.kernel.org # v3.7+ Signed-off-by: Trond Myklebust --- include/linux/nfs4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index bca536341d1a..1b1ca04820a3 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -282,7 +282,7 @@ enum nfsstat4 { static inline bool seqid_mutating_err(u32 err) { - /* rfc 3530 section 8.1.5: */ + /* See RFC 7530, section 9.1.7 */ switch (err) { case NFS4ERR_STALE_CLIENTID: case NFS4ERR_STALE_STATEID: @@ -291,6 +291,7 @@ static inline bool seqid_mutating_err(u32 err) case NFS4ERR_BADXDR: case NFS4ERR_RESOURCE: case NFS4ERR_NOFILEHANDLE: + case NFS4ERR_MOVED: return false; }; return true; -- cgit v1.2.3 From c929ea0b910355e1876c64431f3d5802f95b3d75 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 20 Jan 2017 16:48:39 +0800 Subject: SUNRPC: cleanup ida information when removing sunrpc module After removing sunrpc module, I get many kmemleak information as, unreferenced object 0xffff88003316b1e0 (size 544): comm "gssproxy", pid 2148, jiffies 4294794465 (age 4200.081s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc+0x15e/0x1f0 [] ida_pre_get+0xaa/0x150 [] ida_simple_get+0xad/0x180 [] nlmsvc_lookup_host+0x4ab/0x7f0 [lockd] [] lockd+0x4d/0x270 [lockd] [] param_set_timeout+0x55/0x100 [lockd] [] svc_defer+0x114/0x3f0 [sunrpc] [] svc_defer+0x2d7/0x3f0 [sunrpc] [] rpc_show_info+0x8a/0x110 [sunrpc] [] proc_reg_write+0x7f/0xc0 [] __vfs_write+0xdf/0x3c0 [] vfs_write+0xef/0x240 [] SyS_write+0xad/0x130 [] entry_SYSCALL_64_fastpath+0x1a/0xa9 [] 0xffffffffffffffff I found, the ida information (dynamic memory) isn't cleanup. Signed-off-by: Kinglong Mee Fixes: 2f048db4680a ("SUNRPC: Add an identifier for struct rpc_clnt") Cc: stable@vger.kernel.org # v3.12+ Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 85cc819676e8..333ad11b3dd9 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -216,5 +216,6 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *); void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap); +void rpc_cleanup_clids(void); #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ -- cgit v1.2.3 From 8a1f780e7f28c7c1d640118242cf68d528c456cd Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Tue, 24 Jan 2017 15:17:45 -0800 Subject: memory_hotplug: make zone_can_shift() return a boolean value online_{kernel|movable} is used to change the memory zone to ZONE_{NORMAL|MOVABLE} and online the memory. To check that memory zone can be changed, zone_can_shift() is used. Currently the function returns minus integer value, plus integer value and 0. When the function returns minus or plus integer value, it means that the memory zone can be changed to ZONE_{NORNAL|MOVABLE}. But when the function returns 0, there are two meanings. One of the meanings is that the memory zone does not need to be changed. For example, when memory is in ZONE_NORMAL and onlined by online_kernel the memory zone does not need to be changed. Another meaning is that the memory zone cannot be changed. When memory is in ZONE_NORMAL and onlined by online_movable, the memory zone may not be changed to ZONE_MOVALBE due to memory online limitation(see Documentation/memory-hotplug.txt). In this case, memory must not be onlined. The patch changes the return type of zone_can_shift() so that memory online operation fails when memory zone cannot be changed as follows: Before applying patch: # grep -A 35 "Node 2" /proc/zoneinfo Node 2, zone Normal node_scanned 0 spanned 8388608 present 7864320 managed 7864320 # echo online_movable > memory4097/state # grep -A 35 "Node 2" /proc/zoneinfo Node 2, zone Normal node_scanned 0 spanned 8388608 present 8388608 managed 8388608 online_movable operation succeeded. But memory is onlined as ZONE_NORMAL, not ZONE_MOVABLE. After applying patch: # grep -A 35 "Node 2" /proc/zoneinfo Node 2, zone Normal node_scanned 0 spanned 8388608 present 7864320 managed 7864320 # echo online_movable > memory4097/state bash: echo: write error: Invalid argument # grep -A 35 "Node 2" /proc/zoneinfo Node 2, zone Normal node_scanned 0 spanned 8388608 present 7864320 managed 7864320 online_movable operation failed because of failure of changing the memory zone from ZONE_NORMAL to ZONE_MOVABLE Fixes: df429ac03936 ("memory-hotplug: more general validation of zone during online") Link: http://lkml.kernel.org/r/2f9c3837-33d7-b6e5-59c0-6ca4372b2d84@gmail.com Signed-off-by: Yasuaki Ishimatsu Reviewed-by: Reza Arbab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 01033fadea47..c1784c0b4f35 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -284,7 +284,7 @@ extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern int zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target); +extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, + enum zone_type target, int *zone_shift); #endif /* __LINUX_MEMORY_HOTPLUG_H */ -- cgit v1.2.3 From b94f51183b0617e7b9b4fb4137d4cf1cab7547c2 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 24 Jan 2017 15:17:53 -0800 Subject: kernel/watchdog: prevent false hardlockup on overloaded system On an overloaded system, it is possible that a change in the watchdog threshold can be delayed long enough to trigger a false positive. This can easily be achieved by having a cpu spinning indefinitely on a task, while another cpu updates watchdog threshold. What happens is while trying to park the watchdog threads, the hrtimers on the other cpus trigger and reprogram themselves with the new slower watchdog threshold. Meanwhile, the nmi watchdog is still programmed with the old faster threshold. Because the one cpu is blocked, it prevents the thread parking on the other cpus from completing, which is needed to shutdown the nmi watchdog and reprogram it correctly. As a result, a false positive from the nmi watchdog is reported. Fix this by setting a park_in_progress flag to block all lockups until the parking is complete. Fix provided by Ulrich Obergfell. [akpm@linux-foundation.org: s/park_in_progress/watchdog_park_in_progress/] Link: http://lkml.kernel.org/r/1481041033-192236-1-git-send-email-dzickus@redhat.com Signed-off-by: Don Zickus Reviewed-by: Aaron Tomlin Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index aacca824a6ae..0a3fadc32693 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -110,6 +110,7 @@ extern int watchdog_user_enabled; extern int watchdog_thresh; extern unsigned long watchdog_enabled; extern unsigned long *watchdog_cpumask_bits; +extern atomic_t watchdog_park_in_progress; #ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; -- cgit v1.2.3 From ea57485af8f4221312a5a95d63c382b45e7840dc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 24 Jan 2017 15:18:32 -0800 Subject: mm, page_alloc: fix check for NULL preferred_zone Patch series "fix premature OOM regression in 4.7+ due to cpuset races". This is v2 of my attempt to fix the recent report based on LTP cpuset stress test [1]. The intention is to go to stable 4.9 LTSS with this, as triggering repeated OOMs is not nice. That's why the patches try to be not too intrusive. Unfortunately why investigating I found that modifying the testcase to use per-VMA policies instead of per-task policies will bring the OOM's back, but that seems to be much older and harder to fix problem. I have posted a RFC [2] but I believe that fixing the recent regressions has a higher priority. Longer-term we might try to think how to fix the cpuset mess in a better and less error prone way. I was for example very surprised to learn, that cpuset updates change not only task->mems_allowed, but also nodemask of mempolicies. Until now I expected the parameter to alloc_pages_nodemask() to be stable. I wonder why do we then treat cpusets specially in get_page_from_freelist() and distinguish HARDWALL etc, when there's unconditional intersection between mempolicy and cpuset. I would expect the nodemask adjustment for saving overhead in g_p_f(), but that clearly doesn't happen in the current form. So we have both crazy complexity and overhead, AFAICS. [1] https://lkml.kernel.org/r/CAFpQJXUq-JuEP=QPidy4p_=FN0rkH5Z-kfB4qBvsf6jMS87Edg@mail.gmail.com [2] https://lkml.kernel.org/r/7c459f26-13a6-a817-e508-b65b903a8378@suse.cz This patch (of 4): Since commit c33d6c06f60f ("mm, page_alloc: avoid looking up the first zone in a zonelist twice") we have a wrong check for NULL preferred_zone, which can theoretically happen due to concurrent cpuset modification. We check the zoneref pointer which is never NULL and we should check the zone pointer. Also document this in first_zones_zonelist() comment per Michal Hocko. Fixes: c33d6c06f60f ("mm, page_alloc: avoid looking up the first zone in a zonelist twice") Link: http://lkml.kernel.org/r/20170120103843.24587-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Acked-by: Hillf Danton Cc: Ganapatrao Kulkarni Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 36d9896fbc1e..f4aac87adcc3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -972,12 +972,16 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, * @zonelist - The zonelist to search for a suitable zone * @highest_zoneidx - The zone index of the highest zone to return * @nodes - An optional nodemask to filter the zonelist with - * @zone - The first suitable zone found is returned via this parameter + * @return - Zoneref pointer for the first suitable zone found (see below) * * This function returns the first zone at or below a given zone index that is * within the allowed nodemask. The zoneref returned is a cursor that can be * used to iterate the zonelist with next_zones_zonelist by advancing it by * one before calling. + * + * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is + * never NULL). This may happen either genuinely, or due to concurrent nodemask + * update due to cpuset modification. */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, -- cgit v1.2.3 From d6f8cfa3dea294eabf8f302e90176dd6381fb66e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 25 Jan 2017 11:39:49 +0100 Subject: net: phy: leds: Break dependency of phy.h on phy_led_triggers.h includes , which is not really needed. Drop the include from , and add it to all users that didn't include it explicitly. Suggested-by: Andrew Lunn Signed-off-by: Geert Uytterhoeven Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index f7d95f644eed..7fc1105605bf 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -25,7 +25,6 @@ #include #include #include -#include #include -- cgit v1.2.3 From 3c880eb0205222bb062970085ebedc73ec8dfd14 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 25 Jan 2017 11:39:50 +0100 Subject: net: phy: leds: Fix truncated LED trigger names Commit 4567d686f5c6d955 ("phy: increase size of MII_BUS_ID_SIZE and bus_id") increased the size of MII bus IDs, but forgot to update the private definition in . This may cause: 1. Truncation of LED trigger names, 2. Duplicate LED trigger names, 3. Failures registering LED triggers, 4. Crashes due to bad error handling in the LED trigger failure path. To fix this, and prevent the definitions going out of sync again in the future, let the PHY LED trigger code use the existing MII_BUS_ID_SIZE definition. Example: - Before I had triggers "ee700000.etherne:01:100Mbps" and "ee700000.etherne:01:10Mbps", - After the increase of MII_BUS_ID_SIZE, both became "ee700000.ethernet-ffffffff:01:" => FAIL, - Now, the triggers are "ee700000.ethernet-ffffffff:01:100Mbps" and "ee700000.ethernet-ffffffff:01:10Mbps", which are unique again. Fixes: 4567d686f5c6d955 ("phy: increase size of MII_BUS_ID_SIZE and bus_id") Fixes: 2e0bc452f4721520 ("net: phy: leds: add support for led triggers on phy link state change") Signed-off-by: Geert Uytterhoeven Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy_led_triggers.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy_led_triggers.h b/include/linux/phy_led_triggers.h index a2daea0a37d2..b37b05bfd1a6 100644 --- a/include/linux/phy_led_triggers.h +++ b/include/linux/phy_led_triggers.h @@ -18,11 +18,11 @@ struct phy_device; #ifdef CONFIG_LED_TRIGGER_PHY #include +#include #define PHY_LED_TRIGGER_SPEED_SUFFIX_SIZE 10 -#define PHY_MII_BUS_ID_SIZE (20 - 3) -#define PHY_LINK_LED_TRIGGER_NAME_SIZE (PHY_MII_BUS_ID_SIZE + \ +#define PHY_LINK_LED_TRIGGER_NAME_SIZE (MII_BUS_ID_SIZE + \ FIELD_SIZEOF(struct mdio_device, addr)+\ PHY_LED_TRIGGER_SPEED_SUFFIX_SIZE) -- cgit v1.2.3 From 9d162ed69f51cbd9ee5a0c7e82aba7acc96362ff Mon Sep 17 00:00:00 2001 From: Sean Nyekjaer Date: Fri, 27 Jan 2017 08:46:23 +0100 Subject: net: phy: micrel: add support for KSZ8795 This is adds support for the PHYs in the KSZ8795 5port managed switch. It will allow to detect the link between the switch and the soc and uses the same read_status functions as the KSZ8873MLL switch. Signed-off-by: Sean Nyekjaer Signed-off-by: David S. Miller --- include/linux/micrel_phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index 257173e0095e..f541da68d1e7 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -35,6 +35,8 @@ #define PHY_ID_KSZ886X 0x00221430 #define PHY_ID_KSZ8863 0x00221435 +#define PHY_ID_KSZ8795 0x00221550 + /* struct phy_device dev_flags definitions */ #define MICREL_PHY_50MHZ_CLK 0x00000001 #define MICREL_PHY_FXEN 0x00000002 -- cgit v1.2.3