From b2ca916ce392a9d4cea3489a3efb2b627b839eaf Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Feb 2020 12:00:48 -0800 Subject: ACPI: NUMA: Up-level "map to online node" functionality The acpi_map_pxm_to_online_node() helper is used to find the closest online node to a given proximity domain. This is used to map devices in a proximity domain with no online memory or cpus to the closest online node and populate a device's 'numa_node' property. The numa_node property allows applications to be migrated "close" to a resource. In preparation for providing a generic facility to optionally map an address range to its closest online node, or the node the range would represent were it to be onlined (target_node), up-level the core of acpi_map_pxm_to_online_node() to a generic mm/numa helper. Cc: Michal Hocko Acked-by: Rafael J. Wysocki Reviewed-by: Ingo Molnar Signed-off-by: Dan Williams Link: https://lore.kernel.org/r/158188324802.894464.13128795207831894206.stgit@dwillia2-desk3.amr.corp.intel.com --- include/linux/acpi.h | 23 ++++++++++++++++++++++- include/linux/numa.h | 9 +++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 0f24d701fbdc..3839363081f3 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -416,9 +416,30 @@ extern void acpi_osi_setup(char *str); extern bool acpi_osi_is_win8(void); #ifdef CONFIG_ACPI_NUMA -int acpi_map_pxm_to_online_node(int pxm); int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); + +/** + * acpi_map_pxm_to_online_node - Map proximity ID to online node + * @pxm: ACPI proximity ID + * + * This is similar to acpi_map_pxm_to_node(), but always returns an online + * node. When the mapped node from a given proximity ID is offline, it + * looks up the node distance table and returns the nearest online node. + * + * ACPI device drivers, which are called after the NUMA initialization has + * completed in the kernel, can call this interface to obtain their device + * NUMA topology from ACPI tables. Such drivers do not have to deal with + * offline nodes. A node may be offline when a device proximity ID is + * unique, SRAT memory entry does not exist, or NUMA is disabled, ex. + * "numa=off" on x86. + */ +static inline int acpi_map_pxm_to_online_node(int pxm) +{ + int node = acpi_map_pxm_to_node(pxm); + + return numa_map_to_online_node(node); +} #else static inline int acpi_map_pxm_to_online_node(int pxm) { diff --git a/include/linux/numa.h b/include/linux/numa.h index 110b0e5d0fb0..20f4e44b186c 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -13,4 +13,13 @@ #define NUMA_NO_NODE (-1) +#ifdef CONFIG_NUMA +int numa_map_to_online_node(int node); +#else +static inline int numa_map_to_online_node(int node) +{ + return NUMA_NO_NODE; +} +#endif + #endif /* _LINUX_NUMA_H */ -- cgit v1.2.3 From 1e5d8e1e47afde23e3249aed25d7d124feff5c1c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Feb 2020 12:01:04 -0800 Subject: x86/mm: Introduce CONFIG_NUMA_KEEP_MEMINFO Currently x86 numa_meminfo is marked __initdata in the CONFIG_MEMORY_HOTPLUG=n case. In support of a new facility to allow drivers to map reserved memory to a 'target_node' (phys_to_target_node()), add support for removing the __initdata designation for those users. Both memory hotplug and phys_to_target_node() users select CONFIG_NUMA_KEEP_MEMINFO to tell the arch to maintain its physical address to NUMA mapping infrastructure post init. Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Cc: Andrew Morton Cc: David Hildenbrand Cc: Michal Hocko Reviewed-by: Ingo Molnar Signed-off-by: Dan Williams Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/158188326422.894464.15742054998046628934.stgit@dwillia2-desk3.amr.corp.intel.com --- include/linux/numa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/numa.h b/include/linux/numa.h index 20f4e44b186c..5773cd2613fc 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -13,6 +13,13 @@ #define NUMA_NO_NODE (-1) +/* optionally keep NUMA memory info available post init */ +#ifdef CONFIG_NUMA_KEEP_MEMINFO +#define __initdata_or_meminfo +#else +#define __initdata_or_meminfo __initdata +#endif + #ifdef CONFIG_NUMA int numa_map_to_online_node(int node); #else -- cgit v1.2.3 From 5d30f92e7631286b8617777c5400c8eadcae50a1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Feb 2020 12:01:09 -0800 Subject: x86/NUMA: Provide a range-to-target_node lookup facility The DEV_DAX_KMEM facility is a generic mechanism to allow device-dax instances, fronting performance-differentiated-memory like pmem, to be added to the System RAM pool. The NUMA node for that hot-added memory is derived from the device-dax instance's 'target_node' attribute. Recall that the 'target_node' is the ACPI-PXM-to-node translation for memory when it comes online whereas the 'numa_node' attribute of the device represents the closest online cpu node. Presently useful target_node information from the ACPI SRAT is discarded with the expectation that "Reserved" memory will never be onlined. Now, DEV_DAX_KMEM violates that assumption, there is a need to retain the translation. Move, rather than discard, numa_memblk data to a secondary array that memory_add_physaddr_to_target_node() may consider at a later point in time. Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Cc: Andrew Morton Cc: David Hildenbrand Cc: Michal Hocko Reported-by: kbuild test robot Reviewed-by: Ingo Molnar Signed-off-by: Dan Williams Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/158188326978.894464.217282995221175417.stgit@dwillia2-desk3.amr.corp.intel.com --- include/linux/numa.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/numa.h b/include/linux/numa.h index 5773cd2613fc..a42df804679e 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NUMA_H #define _LINUX_NUMA_H - +#include #ifdef CONFIG_NODES_SHIFT #define NODES_SHIFT CONFIG_NODES_SHIFT @@ -21,12 +21,24 @@ #endif #ifdef CONFIG_NUMA +/* Generic implementation available */ int numa_map_to_online_node(int node); + +/* + * Optional architecture specific implementation, users need a "depends + * on $ARCH" + */ +int phys_to_target_node(phys_addr_t addr); #else static inline int numa_map_to_online_node(int node) { return NUMA_NO_NODE; } + +static inline int phys_to_target_node(phys_addr_t addr) +{ + return NUMA_NO_NODE; +} #endif #endif /* _LINUX_NUMA_H */ -- cgit v1.2.3 From 1d0827b75ee7df497f611a2ac412a88135fb0ef5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 30 Jan 2020 12:06:01 -0800 Subject: mm/memremap_pages: Kill unused __devm_memremap_pages() Kill this definition that was introduced in commit 41e94a851304 ("add devm_memremap_pages") add never used. Cc: Christoph Hellwig Reviewed-by: Aneesh Kumar K.V Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/158041476158.3889308.4221100673554151124.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/io.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io.h b/include/linux/io.h index b1c44bb4b2d7..8394c56babc2 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -77,8 +77,6 @@ void *devm_memremap(struct device *dev, resource_size_t offset, size_t size, unsigned long flags); void devm_memunmap(struct device *dev, void *addr); -void *__devm_memremap_pages(struct device *dev, struct resource *res); - #ifdef CONFIG_PCI /* * The PCI specifications (Rev 3.0, 3.2.5 "Transaction Ordering and -- cgit v1.2.3 From 9ffc1d19fc4a6dfcfe06c91c2861ad6d44fdd92d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 30 Jan 2020 12:06:07 -0800 Subject: mm/memremap_pages: Introduce memremap_compat_align() The "sub-section memory hotplug" facility allows memremap_pages() users like libnvdimm to compensate for hardware platforms like x86 that have a section size larger than their hardware memory mapping granularity. The compensation that sub-section support affords is being tolerant of physical memory resources shifting by units smaller (64MiB on x86) than the memory-hotplug section size (128 MiB). Where the platform physical-memory mapping granularity is limited by the number and capability of address-decode-registers in the memory controller. While the sub-section support allows memremap_pages() to operate on sub-section (2MiB) granularity, the Power architecture may still require 16MiB alignment on "!radix_enabled()" platforms. In order for libnvdimm to be able to detect and manage this per-arch limitation, introduce memremap_compat_align() as a common minimum alignment across all driver-facing memory-mapping interfaces, and let Power override it to 16MiB in the "!radix_enabled()" case. The assumption / requirement for 16MiB to be a viable memremap_compat_align() value is that Power does not have platforms where its equivalent of address-decode-registers never hardware remaps a persistent memory resource on smaller than 16MiB boundaries. Note that I tried my best to not add a new Kconfig symbol, but header include entanglements defeated the #ifndef memremap_compat_align design pattern and the need to export it defeats the __weak design pattern for arch overrides. Based on an initial patch by Aneesh. Link: http://lore.kernel.org/r/CAPcyv4gBGNP95APYaBcsocEa50tQj9b5h__83vgngjq3ouGX_Q@mail.gmail.com Reported-by: Aneesh Kumar K.V Reported-by: Jeff Moyer Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Reviewed-by: Aneesh Kumar K.V Acked-by: Michael Ellerman (powerpc) Signed-off-by: Dan Williams --- include/linux/memremap.h | 8 ++++++++ include/linux/mmzone.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 6fefb09af7c3..8af1cbd8f293 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -132,6 +132,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); +unsigned long memremap_compat_align(void); #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) @@ -165,6 +166,12 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) { } + +/* when memremap_pages() is disabled all archs can remap a single page */ +static inline unsigned long memremap_compat_align(void) +{ + return PAGE_SIZE; +} #endif /* CONFIG_ZONE_DEVICE */ static inline void put_dev_pagemap(struct dev_pagemap *pgmap) @@ -172,4 +179,5 @@ static inline void put_dev_pagemap(struct dev_pagemap *pgmap) if (pgmap) percpu_ref_put(pgmap->ref); } + #endif /* _LINUX_MEMREMAP_H_ */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 462f6873905a..6b77f7239af5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1170,6 +1170,7 @@ static inline unsigned long section_nr_to_pfn(unsigned long sec) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) #define SUBSECTION_SHIFT 21 +#define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT) #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT) #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT) -- cgit v1.2.3 From a0e374525def2ef18a078523e1faefb5ce2b05e5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 30 Jan 2020 12:06:18 -0800 Subject: libnvdimm/region: Introduce NDD_LABELING The NDD_ALIASING flag is used to indicate where pmem capacity might alias with blk capacity and require labeling. It is also used to indicate whether the DIMM supports labeling. Separate this latter capability into its own flag so that the NDD_ALIASING flag is scoped to true aliased configurations. To my knowledge aliased configurations only exist in the ACPI spec, there are no known platforms that ship this support in production. This clarity allows namespace-capacity alignment constraints around interleave-ways to be relaxed. Cc: Vishal Verma Cc: Oliver O'Halloran Reviewed-by: Jeff Moyer Reviewed-by: Aneesh Kumar K.V Link: https://lore.kernel.org/r/158041477856.3889308.4212605617834097674.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- include/linux/libnvdimm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 9df091bd30ba..18da4059be09 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -37,6 +37,8 @@ enum { NDD_WORK_PENDING = 4, /* ignore / filter NSLABEL_FLAG_LOCAL for this DIMM, i.e. no aliasing */ NDD_NOBLK = 5, + /* dimm supports namespace labels */ + NDD_LABELING = 6, /* need to set a limit somewhere, but yes, this is likely overkill */ ND_IOCTL_MAX_BUFLEN = SZ_4M, -- cgit v1.2.3 From f605a263e0690177ecc180417eacf2b5507dd177 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 28 Feb 2020 11:34:52 -0500 Subject: dax, pmem: Add a dax operation zero_page_range Add a dax operation zero_page_range, to zero a page. This will also clear any known poison in the page being zeroed. As of now, zeroing of one page is allowed in a single call. There are no callers which are trying to zero more than a page in a single call. Once we grow the callers which zero more than a page in single call, we can add that support. Primary reason for not doing that yet is that this will add little complexity in dm implementation where a range might be spanning multiple underlying targets and one will have to split the range into multiple sub ranges and call zero_page_range() on individual targets. Suggested-by: Christoph Hellwig Signed-off-by: Vivek Goyal Reviewed-by: Pankaj Gupta Link: https://lore.kernel.org/r/20200228163456.1587-3-vgoyal@redhat.com Signed-off-by: Dan Williams --- include/linux/dax.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 328c2dbb4409..71735c430c05 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -34,6 +34,8 @@ struct dax_operations { /* copy_to_iter: required operation for fs-dax direct-i/o */ size_t (*copy_to_iter)(struct dax_device *, pgoff_t, void *, size_t, struct iov_iter *); + /* zero_page_range: required operation. Zero page range */ + int (*zero_page_range)(struct dax_device *, pgoff_t, size_t); }; extern struct attribute_group dax_attribute_group; @@ -199,6 +201,8 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); +int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, + size_t nr_pages); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, -- cgit v1.2.3 From cdf6cdcd3b99a99ea9ecc1b05d1d040d5a69a134 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 28 Feb 2020 11:34:54 -0500 Subject: dm,dax: Add dax zero_page_range operation This patch adds support for dax zero_page_range operation to dm targets. Signed-off-by: Vivek Goyal Acked-by: Mike Snitzer Link: https://lore.kernel.org/r/20200228163456.1587-5-vgoyal@redhat.com Signed-off-by: Dan Williams --- include/linux/device-mapper.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 475668c69dbc..af48d9da3916 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -141,6 +141,8 @@ typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); typedef size_t (*dm_dax_copy_iter_fn)(struct dm_target *ti, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); +typedef int (*dm_dax_zero_page_range_fn)(struct dm_target *ti, pgoff_t pgoff, + size_t nr_pages); #define PAGE_SECTORS (PAGE_SIZE / 512) void dm_error(const char *message); @@ -195,6 +197,7 @@ struct target_type { dm_dax_direct_access_fn direct_access; dm_dax_copy_iter_fn dax_copy_from_iter; dm_dax_copy_iter_fn dax_copy_to_iter; + dm_dax_zero_page_range_fn dax_zero_page_range; /* For internal device-mapper use. */ struct list_head list; -- cgit v1.2.3 From 4f3b4f161d7a070d2181dbcf7fbd97c7631d5c24 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 28 Feb 2020 11:34:56 -0500 Subject: dax,iomap: Add helper dax_iomap_zero() to zero a range Add a helper dax_ioamp_zero() to zero a range. This patch basically merges __dax_zero_page_range() and iomap_dax_zero(). Suggested-by: Christoph Hellwig Signed-off-by: Vivek Goyal Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20200228163456.1587-7-vgoyal@redhat.com Signed-off-by: Dan Williams --- include/linux/dax.h | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dax.h b/include/linux/dax.h index 71735c430c05..d7af5d243f24 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -13,6 +13,7 @@ typedef unsigned long dax_entry_t; struct iomap_ops; +struct iomap; struct dax_device; struct dax_operations { /* @@ -214,20 +215,8 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); - -#ifdef CONFIG_FS_DAX -int __dax_zero_page_range(struct block_device *bdev, - struct dax_device *dax_dev, sector_t sector, - unsigned int offset, unsigned int length); -#else -static inline int __dax_zero_page_range(struct block_device *bdev, - struct dax_device *dax_dev, sector_t sector, - unsigned int offset, unsigned int length) -{ - return -ENXIO; -} -#endif - +int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, + struct iomap *iomap); static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); -- cgit v1.2.3