From 956f8b445061667c3545baa24778f890d1d522f4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 6 Apr 2020 20:07:16 -0700 Subject: drivers/base/memory: rename MMOP_ONLINE_KEEP to MMOP_ONLINE Patch series "mm/memory_hotplug: allow to specify a default online_type", v3. Distributions nowadays use udev rules ([1] [2]) to specify if and how to online hotplugged memory. The rules seem to get more complex with many special cases. Due to the various special cases, CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE cannot be used. All memory hotplug is handled via udev rules. Every time we hotplug memory, the udev rule will come to the same conclusion. Especially Hyper-V (but also soon virtio-mem) add a lot of memory in separate memory blocks and wait for memory to get onlined by user space before continuing to add more memory blocks (to not add memory faster than it is getting onlined). This of course slows down the whole memory hotplug process. To make the job of distributions easier and to avoid udev rules that get more and more complicated, let's extend the mechanism provided by - /sys/devices/system/memory/auto_online_blocks - "memhp_default_state=" on the kernel cmdline to be able to specify also "online_movable" as well as "online_kernel" === Example /usr/libexec/config-memhotplug === #!/bin/bash VIRT=`systemd-detect-virt --vm` ARCH=`uname -p` sense_virtio_mem() { if [ -d "/sys/bus/virtio/drivers/virtio_mem/" ]; then DEVICES=`find /sys/bus/virtio/drivers/virtio_mem/ -maxdepth 1 -type l | wc -l` if [ $DEVICES != "0" ]; then return 0 fi fi return 1 } if [ ! -e "/sys/devices/system/memory/auto_online_blocks" ]; then echo "Memory hotplug configuration support missing in the kernel" exit 1 fi if grep "memhp_default_state=" /proc/cmdline > /dev/null; then echo "Memory hotplug configuration overridden in kernel cmdline (memhp_default_state=)" exit 1 fi if [ $VIRT == "microsoft" ]; then echo "Detected Hyper-V on $ARCH" # Hyper-V wants all memory in ZONE_NORMAL ONLINE_TYPE="online_kernel" elif sense_virtio_mem; then echo "Detected virtio-mem on $ARCH" # virtio-mem wants all memory in ZONE_NORMAL ONLINE_TYPE="online_kernel" elif [ $ARCH == "s390x" ] || [ $ARCH == "s390" ]; then echo "Detected $ARCH" # standby memory should not be onlined automatically ONLINE_TYPE="offline" elif [ $ARCH == "ppc64" ] || [ $ARCH == "ppc64le" ]; then echo "Detected" $ARCH # PPC64 onlines all hotplugged memory right from the kernel ONLINE_TYPE="offline" elif [ $VIRT == "none" ]; then echo "Detected bare-metal on $ARCH" # Bare metal users expect hotplugged memory to be unpluggable. We assume # that ZONE imbalances on such enterpise servers cannot happen and is # properly documented ONLINE_TYPE="online_movable" else # TODO: Hypervisors that want to unplug DIMMs and can guarantee that ZONE # imbalances won't happen echo "Detected $VIRT on $ARCH" # Usually, ballooning is used in virtual environments, so memory should go to # ZONE_NORMAL. However, sometimes "movable_node" is relevant. ONLINE_TYPE="online" fi echo "Selected online_type:" $ONLINE_TYPE # Configure what to do with memory that will be hotplugged in the future echo $ONLINE_TYPE 2>/dev/null > /sys/devices/system/memory/auto_online_blocks if [ $? != "0" ]; then echo "Memory hotplug cannot be configured (e.g., old kernel or missing permissions)" # A backup udev rule should handle old kernels if necessary exit 1 fi # Process all already pluggedd blocks (e.g., DIMMs, but also Hyper-V or virtio-mem) if [ $ONLINE_TYPE != "offline" ]; then for MEMORY in /sys/devices/system/memory/memory*; do STATE=`cat $MEMORY/state` if [ $STATE == "offline" ]; then echo $ONLINE_TYPE > $MEMORY/state fi done fi === Example /usr/lib/systemd/system/config-memhotplug.service === [Unit] Description=Configure memory hotplug behavior DefaultDependencies=no Conflicts=shutdown.target Before=sysinit.target shutdown.target After=systemd-modules-load.service ConditionPathExists=|/sys/devices/system/memory/auto_online_blocks [Service] ExecStart=/usr/libexec/config-memhotplug Type=oneshot TimeoutSec=0 RemainAfterExit=yes [Install] WantedBy=sysinit.target === Example modification to the 40-redhat.rules [2] === : diff --git a/40-redhat.rules b/40-redhat.rules-new : index 2c690e5..168fd03 100644 : --- a/40-redhat.rules : +++ b/40-redhat.rules-new : @@ -6,6 +6,9 @@ SUBSYSTEM=="cpu", ACTION=="add", TEST=="online", ATTR{online}=="0", ATTR{online} : # Memory hotadd request : SUBSYSTEM!="memory", GOTO="memory_hotplug_end" : ACTION!="add", GOTO="memory_hotplug_end" : +# memory hotplug behavior configured : +PROGRAM=="grep online /sys/devices/system/memory/auto_online_blocks", GOTO="memory_hotplug_end" : + : PROGRAM="/bin/uname -p", RESULT=="s390*", GOTO="memory_hotplug_end" : : ENV{.state}="online" === [1] https://github.com/lnykryn/systemd-rhel/pull/281 [2] https://github.com/lnykryn/systemd-rhel/blob/staging/rules/40-redhat.rules This patch (of 8): The name is misleading and it's not really clear what is "kept". Let's just name it like the online_type name we expose to user space ("online"). Add some documentation to the types. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Pankaj Gupta Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Wei Yang Cc: Vitaly Kuznetsov Cc: Yumei Huang Cc: Igor Mammedov Cc: Eduardo Habkost Cc: Benjamin Herrenschmidt Cc: Haiyang Zhang Cc: K. Y. Srinivasan Cc: Michael Ellerman (powerpc) Cc: Paul Mackerras Cc: Stephen Hemminger Cc: Wei Liu Link: http://lkml.kernel.org/r/20200319131221.14044-1-david@redhat.com Link: http://lkml.kernel.org/r/20200317104942.11178-2-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index f4d59155f3d4..261dbf010d5d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -47,9 +47,13 @@ enum { /* Types for control the zone type of onlined and offlined memory */ enum { + /* Offline the memory. */ MMOP_OFFLINE = -1, - MMOP_ONLINE_KEEP, + /* Online the memory. Zone depends, see default_zone_for_pfn(). */ + MMOP_ONLINE, + /* Online the memory to ZONE_NORMAL. */ MMOP_ONLINE_KERNEL, + /* Online the memory to ZONE_MOVABLE. */ MMOP_ONLINE_MOVABLE, }; -- cgit v1.2.3 From efc978ad0e05ed6401c7854811750bf55b67f4b9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 6 Apr 2020 20:07:20 -0700 Subject: drivers/base/memory: map MMOP_OFFLINE to 0 Historically, we used the value -1. Just treat 0 as the special case now. Clarify a comment (which was wrong, when we come via device_online() the first time, the online_type would have been 0 / MEM_ONLINE). The default is now always MMOP_OFFLINE. This removes the last user of the manual "-1", which didn't use the enum value. This is a preparation to use the online_type as an array index. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Michal Hocko Acked-by: Pankaj Gupta Cc: Greg Kroah-Hartman Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Wei Yang Cc: Benjamin Herrenschmidt Cc: Eduardo Habkost Cc: Haiyang Zhang Cc: Igor Mammedov Cc: "K. Y. Srinivasan" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Wei Liu Cc: Yumei Huang Link: http://lkml.kernel.org/r/20200317104942.11178-3-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 261dbf010d5d..c2e06ed5e0e9 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -48,7 +48,7 @@ enum { /* Types for control the zone type of onlined and offlined memory */ enum { /* Offline the memory. */ - MMOP_OFFLINE = -1, + MMOP_OFFLINE = 0, /* Online the memory. Zone depends, see default_zone_for_pfn(). */ MMOP_ONLINE, /* Online the memory to ZONE_NORMAL. */ -- cgit v1.2.3 From 862919e568356cc36288a11b42cd88ec3a7100e9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 6 Apr 2020 20:07:40 -0700 Subject: mm/memory_hotplug: convert memhp_auto_online to store an online_type ... and rename it to memhp_default_online_type. This is a preparation for more detailed default online behavior. Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Michal Hocko Acked-by: Pankaj Gupta Cc: Greg Kroah-Hartman Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Wei Yang Cc: Benjamin Herrenschmidt Cc: Eduardo Habkost Cc: Haiyang Zhang Cc: Igor Mammedov Cc: "K. Y. Srinivasan" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Wei Liu Cc: Yumei Huang Link: http://lkml.kernel.org/r/20200317104942.11178-8-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c2e06ed5e0e9..c6e090b34c4b 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -117,7 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size, struct mhp_restrictions *restrictions); extern u64 max_mem_size; -extern bool memhp_auto_online; +/* Default online_type (MMOP_*) when new memory blocks are added. */ +extern int memhp_default_online_type; /* If movable_node boot option specified */ extern bool movable_node_enabled; static inline bool movable_node_is_enabled(void) -- cgit v1.2.3 From 5f47adf762b78cae97de58d9ff01d2d44db09467 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 6 Apr 2020 20:07:44 -0700 Subject: mm/memory_hotplug: allow to specify a default online_type For now, distributions implement advanced udev rules to essentially - Don't online any hotplugged memory (s390x) - Online all memory to ZONE_NORMAL (e.g., most virt environments like hyperv) - Online all memory to ZONE_MOVABLE in case the zone imbalance is taken care of (e.g., bare metal, special virt environments) In summary: All memory is usually onlined the same way, however, the kernel always has to ask user space to come up with the same answer. E.g., Hyper-V always waits for a memory block to get onlined before continuing, otherwise it might end up adding memory faster than onlining it, which can result in strange OOM situations. This waiting slows down adding of a bigger amount of memory. Let's allow to specify a default online_type, not just "online" and "offline". This allows distributions to configure the default online_type when booting up and be done with it. We can now specify "offline", "online", "online_movable" and "online_kernel" via - "memhp_default_state=" on the kernel cmdline - /sys/devices/system/memory/auto_online_blocks just like we are able to specify for a single memory block via /sys/devices/system/memory/memoryX/state Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton Reviewed-by: Wei Yang Reviewed-by: Baoquan He Acked-by: Michal Hocko Acked-by: Pankaj Gupta Cc: Greg Kroah-Hartman Cc: Oscar Salvador Cc: "Rafael J. Wysocki" Cc: Wei Yang Cc: Benjamin Herrenschmidt Cc: Eduardo Habkost Cc: Haiyang Zhang Cc: Igor Mammedov Cc: "K. Y. Srinivasan" Cc: Michael Ellerman Cc: Paul Mackerras Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Wei Liu Cc: Yumei Huang Link: http://lkml.kernel.org/r/20200317104942.11178-9-david@redhat.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c6e090b34c4b..ef55115320fb 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -117,6 +117,8 @@ extern int arch_add_memory(int nid, u64 start, u64 size, struct mhp_restrictions *restrictions); extern u64 max_mem_size; +extern int memhp_online_type_from_str(const char *str); + /* Default online_type (MMOP_*) when new memory blocks are added. */ extern int memhp_default_online_type; /* If movable_node boot option specified */ -- cgit v1.2.3 From 96c6b598135e7cec66161e8943823470c7c8954e Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 10 Apr 2020 14:33:17 -0700 Subject: mm/memory_hotplug: drop the flags field from struct mhp_restrictions Patch series "Allow setting caching mode in arch_add_memory() for P2PDMA", v4. Currently, the page tables created using memremap_pages() are always created with the PAGE_KERNEL cacheing mode. However, the P2PDMA code is creating pages for PCI BAR memory which should never be accessed through the cache and instead use either WC or UC. This still works in most cases, on x86, because the MTRR registers typically override the caching settings in the page tables for all of the IO memory to be UC-. However, this tends not to work so well on other arches or some rare x86 machines that have firmware which does not setup the MTRR registers in this way. Instead of this, this series proposes a change to arch_add_memory() to take the pgprot required by the mapping which allows us to explicitly set pagetable entries for P2PDMA memory to UC. This changes is pretty routine for most of the arches: x86_64, arm64 and powerpc simply need to thread the pgprot through to where the page tables are setup. x86_32 unfortunately sets up the page tables at boot so must use _set_memory_prot() to change their caching mode. ia64, s390 and sh don't appear to have an easy way to change the page tables so, for now at least, we just return -EINVAL on such mappings and thus they will not support P2PDMA memory until the work for this is done. This should be fine as they don't yet support ZONE_DEVICE. This patch (of 7): This variable is not used anywhere and should therefore be removed from the structure. Signed-off-by: Logan Gunthorpe Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Reviewed-by: Dan Williams Acked-by: Michal Hocko Cc: Christoph Hellwig Cc: Catalin Marinas Cc: Will Deacon Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Eric Badger Cc: "H. Peter Anvin" Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Paul Mackerras Link: http://lkml.kernel.org/r/20200306170846.9333-2-logang@deltatee.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ef55115320fb..7c1bcff11672 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -59,11 +59,9 @@ enum { /* * Restrictions for the memory hotplug: - * flags: MHP_ flags * altmap: alternative allocator for memmap array */ struct mhp_restrictions { - unsigned long flags; struct vmem_altmap *altmap; }; -- cgit v1.2.3 From f5637d3b42ab0465ef71d5fb8461bce97fba95e8 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 10 Apr 2020 14:33:21 -0700 Subject: mm/memory_hotplug: rename mhp_restrictions to mhp_params The mhp_restrictions struct really doesn't specify anything resembling a restriction anymore so rename it to be mhp_params as it is a list of extended parameters. Signed-off-by: Logan Gunthorpe Signed-off-by: Andrew Morton Reviewed-by: David Hildenbrand Reviewed-by: Dan Williams Acked-by: Michal Hocko Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Eric Badger Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20200306170846.9333-3-logang@deltatee.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 7c1bcff11672..75f0f6304735 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -58,10 +58,10 @@ enum { }; /* - * Restrictions for the memory hotplug: - * altmap: alternative allocator for memmap array + * Extended parameters for memory hotplug: + * altmap: alternative allocator for memmap array (optional) */ -struct mhp_restrictions { +struct mhp_params { struct vmem_altmap *altmap; }; @@ -112,7 +112,7 @@ extern int restore_online_page_callback(online_page_callback_t callback); extern int try_online_node(int nid); extern int arch_add_memory(int nid, u64 start, u64 size, - struct mhp_restrictions *restrictions); + struct mhp_params *params); extern u64 max_mem_size; extern int memhp_online_type_from_str(const char *str); @@ -133,17 +133,17 @@ extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages, /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct mhp_restrictions *restrictions); + struct mhp_params *params); #ifndef CONFIG_ARCH_HAS_ADD_PAGES static inline int add_pages(int nid, unsigned long start_pfn, - unsigned long nr_pages, struct mhp_restrictions *restrictions) + unsigned long nr_pages, struct mhp_params *params) { - return __add_pages(nid, start_pfn, nr_pages, restrictions); + return __add_pages(nid, start_pfn, nr_pages, params); } #else /* ARCH_HAS_ADD_PAGES */ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct mhp_restrictions *restrictions); + struct mhp_params *params); #endif /* ARCH_HAS_ADD_PAGES */ #ifdef CONFIG_NUMA -- cgit v1.2.3 From bfeb022f8fe4c5afdcfd7a3d868fac9765f9bcad Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 10 Apr 2020 14:33:36 -0700 Subject: mm/memory_hotplug: add pgprot_t to mhp_params devm_memremap_pages() is currently used by the PCI P2PDMA code to create struct page mappings for IO memory. At present, these mappings are created with PAGE_KERNEL which implies setting the PAT bits to be WB. However, on x86, an mtrr register will typically override this and force the cache type to be UC-. In the case firmware doesn't set this register it is effectively WB and will typically result in a machine check exception when it's accessed. Other arches are not currently likely to function correctly seeing they don't have any MTRR registers to fall back on. To solve this, provide a way to specify the pgprot value explicitly to arch_add_memory(). Of the arches that support MEMORY_HOTPLUG: x86_64, and arm64 need a simple change to pass the pgprot_t down to their respective functions which set up the page tables. For x86_32, set the page tables explicitly using _set_memory_prot() (seeing they are already mapped). For ia64, s390 and sh, reject anything but PAGE_KERNEL settings -- this should be fine, for now, seeing these architectures don't support ZONE_DEVICE. A check in __add_pages() is also added to ensure the pgprot parameter was set for all arches. Signed-off-by: Logan Gunthorpe Signed-off-by: Andrew Morton Acked-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Dan Williams Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Eric Badger Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/20200306170846.9333-7-logang@deltatee.com Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 75f0f6304735..93d9ada74ddd 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -60,9 +60,12 @@ enum { /* * Extended parameters for memory hotplug: * altmap: alternative allocator for memmap array (optional) + * pgprot: page protection flags to apply to newly created page tables + * (required) */ struct mhp_params { struct vmem_altmap *altmap; + pgprot_t pgprot; }; /* -- cgit v1.2.3