From 1b862aecfbd419cdc4553645bf86d07554279bed Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:45 -0700 Subject: mm, memory_hotplug: get rid of is_zone_device_section Device memory hotplug hooks into regular memory hotplug only half way. It needs memory sections to track struct pages but there is no need/desire to associate those sections with memory blocks and export them to the userspace via sysfs because they cannot be onlined anyway. This is currently expressed by for_device argument to arch_add_memory which then makes sure to associate the given memory range with ZONE_DEVICE. register_new_memory then relies on is_zone_device_section to distinguish special memory hotplug from the regular one. While this works now, later patches in this series want to move __add_zone outside of arch_add_memory path so we have to come up with something else. Add want_memblock down the __add_pages path and use it to control whether the section->memblock association should be done. arch_add_memory then just trivially want memblock for everything but for_device hotplug. remove_memory_section doesn't need is_zone_device_section either. We can simply skip all the memblock specific cleanup if there is no memblock for the given section. This shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20170515085827.16474-5-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: Dan Williams Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 134a2f69c21a..3c8cf86201c3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -111,7 +111,7 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, /* reasonably generic interface to expand the physical pages in a zone */ extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages); + unsigned long nr_pages, bool want_memblock); #ifdef CONFIG_NUMA extern int memory_add_physaddr_to_nid(u64 start); -- cgit v1.2.3 From 2d070eab2e8270c8a84d480bb91e4f739315f03d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:56 -0700 Subject: mm: consider zone which is not fully populated to have holes __pageblock_pfn_to_page has two users currently, set_zone_contiguous which checks whether the given zone contains holes and pageblock_pfn_to_page which then carefully returns a first valid page from the given pfn range for the given zone. This doesn't handle zones which are not fully populated though. Memory pageblocks can be offlined or might not have been onlined yet. In such a case the zone should be considered to have holes otherwise pfn walkers can touch and play with offline pages. Current callers of pageblock_pfn_to_page in compaction seem to work properly right now because they only isolate PageBuddy (isolate_freepages_block) or PageLRU resp. __PageMovable (isolate_migratepages_block) which will be always false for these pages. It would be safer to skip these pages altogether, though. In order to do this patch adds a new memory section state (SECTION_IS_ONLINE) which is set in memory_present (during boot time) or in online_pages_range during the memory hotplug. Similarly offline_mem_sections clears the bit and it is called when the memory range is offlined. pfn_to_online_page helper is then added which check the mem section and only returns a page if it is onlined already. Use the new helper in __pageblock_pfn_to_page and skip the whole page block in such a case. [mhocko@suse.com: check valid section number in pfn_to_online_page (Vlastimil), mark sections online after all struct pages are initialized in online_pages_range (Vlastimil)] Link: http://lkml.kernel.org/r/20170518164210.GD18333@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20170515085827.16474-8-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3c8cf86201c3..a61aede1b391 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -14,6 +14,20 @@ struct memory_block; struct resource; #ifdef CONFIG_MEMORY_HOTPLUG +/* + * Return page for the valid pfn only if the page is online. All pfn + * walkers which rely on the fully initialized page->flags and others + * should use this rather than pfn_valid && pfn_to_page + */ +#define pfn_to_online_page(pfn) \ +({ \ + struct page *___page = NULL; \ + unsigned long ___nr = pfn_to_section_nr(pfn); \ + \ + if (___nr < NR_MEM_SECTIONS && online_section_nr(___nr))\ + ___page = pfn_to_page(pfn); \ + ___page; \ +}) /* * Types for free bootmem stored in page->lru.next. These have to be in @@ -203,6 +217,14 @@ extern void set_zone_contiguous(struct zone *zone); extern void clear_zone_contiguous(struct zone *zone); #else /* ! CONFIG_MEMORY_HOTPLUG */ +#define pfn_to_online_page(pfn) \ +({ \ + struct page *___page = NULL; \ + if (pfn_valid(pfn)) \ + ___page = pfn_to_page(pfn); \ + ___page; \ + }) + /* * Stub functions for when hotplug is off */ -- cgit v1.2.3 From f1dd2cd13c4bbbc9a7c4617b3b034fa643de98fe Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:11 -0700 Subject: mm, memory_hotplug: do not associate hotadded memory to zones until online The current memory hotplug implementation relies on having all the struct pages associate with a zone/node during the physical hotplug phase (arch_add_memory->__add_pages->__add_section->__add_zone). In the vast majority of cases this means that they are added to ZONE_NORMAL. This has been so since 9d99aaa31f59 ("[PATCH] x86_64: Support memory hotadd without sparsemem") and it wasn't a big deal back then because movable onlining didn't exist yet. Much later memory hotplug wanted to (ab)use ZONE_MOVABLE for movable onlining 511c2aba8f07 ("mm, memory-hotplug: dynamic configure movable memory and portion memory") and then things got more complicated. Rather than reconsidering the zone association which was no longer needed (because the memory hotplug already depended on SPARSEMEM) a convoluted semantic of zone shifting has been developed. Only the currently last memblock or the one adjacent to the zone_movable can be onlined movable. This essentially means that the online type changes as the new memblocks are added. Let's simulate memory hot online manually $ echo 0x100000000 > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory32/valid_zones Normal Movable $ echo $((0x100000000+(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable $ echo $((0x100000000+2*(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal /sys/devices/system/memory/memory34/valid_zones:Normal Movable $ echo online_movable > /sys/devices/system/memory/memory34/state $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Normal This is an awkward semantic because an udev event is sent as soon as the block is onlined and an udev handler might want to online it based on some policy (e.g. association with a node) but it will inherently race with new blocks showing up. This patch changes the physical online phase to not associate pages with any zone at all. All the pages are just marked reserved and wait for the onlining phase to be associated with the zone as per the online request. There are only two requirements - existing ZONE_NORMAL and ZONE_MOVABLE cannot overlap - ZONE_NORMAL precedes ZONE_MOVABLE in physical addresses the latter one is not an inherent requirement and can be changed in the future. It preserves the current behavior and made the code slightly simpler. This is subject to change in future. This means that the same physical online steps as above will lead to the following state: Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Implementation: The current move_pfn_range is reimplemented to check the above requirements (allow_online_pfn_range) and then updates the respective zone (move_pfn_range_to_zone), the pgdat and links all the pages in the pfn range with the zone/node. __add_pages is updated to not require the zone and only initializes sections in the range. This allowed to simplify the arch_add_memory code (s390 could get rid of quite some of code). devm_memremap_pages is the only user of arch_add_memory which relies on the zone association because it only hooks into the memory hotplug only half way. It uses it to associate the new memory with ZONE_DEVICE but doesn't allow it to be {on,off}lined via sysfs. This means that this particular code path has to call move_pfn_range_to_zone explicitly. The original zone shifting code is kept in place and will be removed in the follow up patch for an easier review. Please note that this patch also changes the original behavior when offlining a memory block adjacent to another zone (Normal vs. Movable) used to allow to change its movable type. This will be handled later. [richard.weiyang@gmail.com: simplify zone_intersects()] Link: http://lkml.kernel.org/r/20170616092335.5177-1-richard.weiyang@gmail.com [richard.weiyang@gmail.com: remove duplicate call for set_page_links] Link: http://lkml.kernel.org/r/20170616092335.5177-2-richard.weiyang@gmail.com [akpm@linux-foundation.org: remove unused local `i'] Link: http://lkml.kernel.org/r/20170515085827.16474-12-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Wei Yang Tested-by: Dan Williams Tested-by: Reza Arbab Acked-by: Heiko Carstens # For s390 bits Acked-by: Vlastimil Babka Cc: Martin Schwidefsky Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index a61aede1b391..8a07a49fd8dc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -123,8 +123,8 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); #endif /* CONFIG_MEMORY_HOTREMOVE */ -/* reasonably generic interface to expand the physical pages in a zone */ -extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, +/* reasonably generic interface to expand the physical pages */ +extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, bool want_memblock); #ifdef CONFIG_NUMA @@ -299,15 +299,16 @@ extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); +extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); -extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); +extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn); extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift); - +extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, + int online_type); #endif /* __LINUX_MEMORY_HOTPLUG_H */ -- cgit v1.2.3 From c246a213f5bad687c6c2cea27d7265eaf8f6f5d7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:18 -0700 Subject: mm, memory_hotplug: do not assume ZONE_NORMAL is default kernel zone Heiko Carstens has noticed that he can generate overlapping zones for ZONE_DMA and ZONE_NORMAL: DMA [mem 0x0000000000000000-0x000000007fffffff] Normal [mem 0x0000000080000000-0x000000017fffffff] $ cat /sys/devices/system/memory/block_size_bytes 10000000 $ cat /sys/devices/system/memory/memory5/valid_zones DMA $ echo 0 > /sys/devices/system/memory/memory5/online $ cat /sys/devices/system/memory/memory5/valid_zones Normal $ echo 1 > /sys/devices/system/memory/memory5/online Normal $ cat /proc/zoneinfo Node 0, zone DMA spanned 524288 <----- present 458752 managed 455078 start_pfn: 0 <----- Node 0, zone Normal spanned 720896 present 589824 managed 571648 start_pfn: 327680 <----- The reason is that we assume that the default zone for kernel onlining is ZONE_NORMAL. This was a simplification introduced by the memory hotplug rework and it is easily fixable by checking the range overlap in the zone order and considering the first matching zone as the default one. If there is no such zone then assume ZONE_NORMAL as we have been doing so far. Fixes: "mm, memory_hotplug: do not associate hotadded memory to zones until online" Link: http://lkml.kernel.org/r/20170601083746.4924-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Heiko Carstens Tested-by: Heiko Carstens Acked-by: Vlastimil Babka Cc: Dan Williams Cc: Reza Arbab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8a07a49fd8dc..4d65a2fcac15 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -311,4 +311,6 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type); +extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn, + unsigned long nr_pages); #endif /* __LINUX_MEMORY_HOTPLUG_H */ -- cgit v1.2.3 From 3d79a728f9b2e6ddcce4e02c91c4de1076548a4c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:21 -0700 Subject: mm, memory_hotplug: replace for_device by want_memblock in arch_add_memory arch_add_memory gets for_device argument which then controls whether we want to create memblocks for created memory sections. Simplify the logic by telling whether we want memblocks directly rather than going through pointless negation. This also makes the api easier to understand because it is clear what we want rather than nothing telling for_device which can mean anything. This shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20170515085827.16474-13-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: Dan Williams Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4d65a2fcac15..780c806e17d3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -298,7 +298,7 @@ extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); -extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); +extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); -- cgit v1.2.3 From 559bfc7d1beff814a8e9999d102bf1157ef1f010 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:28 -0700 Subject: mm, memory_hotplug: remove unused cruft after memory hotplug rework zone_for_memory doesn't have any user anymore as well as the whole zone shifting infrastructure so drop them all. This shouldn't introduce any functional changes. Link: http://lkml.kernel.org/r/20170515085827.16474-15-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 780c806e17d3..ed167541e4fc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -296,8 +296,6 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource, bool online); -extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, - bool for_device); extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); -- cgit v1.2.3 From 4932381ee2a77a21641009149722e1bb92bd99e2 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:41:05 -0700 Subject: mm, memory_hotplug: move movable_node to the hotplug proper movable_node_is_enabled is defined in memblock proper while it is initialized from the memory hotplug proper. This is quite messy and it makes a dependency between the two so move movable_node along with the helper functions to memory_hotplug. To make it more entertaining the kernel parameter is ignored unless CONFIG_HAVE_MEMBLOCK_NODE_MAP=y because we do not have the node information for each memblock otherwise. So let's warn when the option is disabled. Link: http://lkml.kernel.org/r/20170529114141.536-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Reza Arbab Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Jerome Glisse Cc: Yasuaki Ishimatsu Cc: Xishi Qiu Cc: Kani Toshimitsu Cc: Chen Yucong Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Cc: Daniel Kiper Cc: Igor Mammedov Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux/memory_hotplug.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ed167541e4fc..c8a5056a5ae0 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -115,6 +115,12 @@ extern void __online_page_free(struct page *page); extern int try_online_node(int nid); extern bool memhp_auto_online; +/* If movable_node boot option specified */ +extern bool movable_node_enabled; +static inline bool movable_node_is_enabled(void) +{ + return movable_node_enabled; +} #ifdef CONFIG_MEMORY_HOTREMOVE extern bool is_pageblock_removable_nolock(struct page *page); @@ -266,6 +272,10 @@ static inline void put_online_mems(void) {} static inline void mem_hotplug_begin(void) {} static inline void mem_hotplug_done(void) {} +static inline bool movable_node_is_enabled(void) +{ + return false; +} #endif /* ! CONFIG_MEMORY_HOTPLUG */ #ifdef CONFIG_MEMORY_HOTREMOVE -- cgit v1.2.3