From 1811582587c43bdf13d690d83345610d4df433bb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:56:46 -0700 Subject: drivers/base/memory: pass a block_id to init_memory_block() We'll rework hotplug_memory_register() shortly, so it no longer consumes pass a section. [cai@lca.pw: fix a compilation warning] Link: http://lkml.kernel.org/r/1559320186-28337-1-git-send-email-cai@lca.pw Link: http://lkml.kernel.org/r/20190527111152.16324-6-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Qian Cai Acked-by: Michal Hocko Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Alex Deucher Cc: Andrew Banman Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Arun KS Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Chris Wilson Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Fenghua Yu Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: Jun Yao Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Brown Cc: Mark Rutland Cc: Masahiro Yamada Cc: Mathieu Malaterre Cc: Michael Ellerman Cc: Mike Rapoport Cc: "mike.travis@hpe.com" Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Oscar Salvador Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Wei Yang Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f180427e48f4..e0aa7f9abb36 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -651,21 +651,18 @@ int register_memory(struct memory_block *memory) return ret; } -static int init_memory_block(struct memory_block **memory, - struct mem_section *section, unsigned long state) +static int init_memory_block(struct memory_block **memory, int block_id, + unsigned long state) { struct memory_block *mem; unsigned long start_pfn; - int scn_nr; int ret = 0; mem = kzalloc(sizeof(*mem), GFP_KERNEL); if (!mem) return -ENOMEM; - scn_nr = __section_nr(section); - mem->start_section_nr = - base_memory_block_id(scn_nr) * sections_per_block; + mem->start_section_nr = block_id * sections_per_block; mem->end_section_nr = mem->start_section_nr + sections_per_block - 1; mem->state = state; start_pfn = section_nr_to_pfn(mem->start_section_nr); @@ -680,21 +677,18 @@ static int init_memory_block(struct memory_block **memory, static int add_memory_block(int base_section_nr) { struct memory_block *mem; - int i, ret, section_count = 0, section_nr; + int i, ret, section_count = 0; for (i = base_section_nr; i < base_section_nr + sections_per_block; - i++) { - if (!present_section_nr(i)) - continue; - if (section_count == 0) - section_nr = i; - section_count++; - } + i++) + if (present_section_nr(i)) + section_count++; if (section_count == 0) return 0; - ret = init_memory_block(&mem, __nr_to_section(section_nr), MEM_ONLINE); + ret = init_memory_block(&mem, base_memory_block_id(base_section_nr), + MEM_ONLINE); if (ret) return ret; mem->section_count = section_count; @@ -707,6 +701,7 @@ static int add_memory_block(int base_section_nr) */ int hotplug_memory_register(int nid, struct mem_section *section) { + int block_id = base_memory_block_id(__section_nr(section)); int ret = 0; struct memory_block *mem; @@ -717,7 +712,7 @@ int hotplug_memory_register(int nid, struct mem_section *section) mem->section_count++; put_device(&mem->dev); } else { - ret = init_memory_block(&mem, section, MEM_OFFLINE); + ret = init_memory_block(&mem, block_id, MEM_OFFLINE); if (ret) goto out; mem->section_count++; -- cgit v1.2.3 From 80ec922dbd87fd38d15719c86a94457204648aeb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:56:51 -0700 Subject: mm/memory_hotplug: allow arch_remove_memory() without CONFIG_MEMORY_HOTREMOVE We want to improve error handling while adding memory by allowing to use arch_remove_memory() and __remove_pages() even if CONFIG_MEMORY_HOTREMOVE is not set to e.g., implement something like: arch_add_memory() rc = do_something(); if (rc) { arch_remove_memory(); } We won't get rid of CONFIG_MEMORY_HOTREMOVE for now, as it will require quite some dependencies for memory offlining. Link: http://lkml.kernel.org/r/20190527111152.16324-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Pavel Tatashin Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: David Hildenbrand Cc: Oscar Salvador Cc: "Kirill A. Shutemov" Cc: Alex Deucher Cc: "David S. Miller" Cc: Mark Brown Cc: Chris Wilson Cc: Christophe Leroy Cc: Nicholas Piggin Cc: Vasily Gorbik Cc: Rob Herring Cc: Masahiro Yamada Cc: "mike.travis@hpe.com" Cc: Andrew Banman Cc: Arun KS Cc: Qian Cai Cc: Mathieu Malaterre Cc: Baoquan He Cc: Logan Gunthorpe Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Chintan Pandya Cc: Dan Williams Cc: Ingo Molnar Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: Jun Yao Cc: Mark Rutland Cc: Mike Rapoport Cc: Oscar Salvador Cc: Robin Murphy Cc: Wei Yang Cc: Will Deacon Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index e0aa7f9abb36..92459d6f12be 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -723,7 +723,6 @@ out: return ret; } -#ifdef CONFIG_MEMORY_HOTREMOVE static void unregister_memory(struct memory_block *memory) { @@ -762,7 +761,6 @@ void unregister_memory_section(struct mem_section *section) out_unlock: mutex_unlock(&mem_sysfs_mutex); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ /* return true if the memory block is offlined, otherwise, return false */ bool is_memblock_offlined(struct memory_block *mem) -- cgit v1.2.3 From db051a0dac13db24d58470d75cee0ce7c6b031a1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:56:56 -0700 Subject: mm/memory_hotplug: create memory block devices after arch_add_memory() Only memory to be added to the buddy and to be onlined/offlined by user space using /sys/devices/system/memory/... needs (and should have!) memory block devices. Factor out creation of memory block devices. Create all devices after arch_add_memory() succeeded. We can later drop the want_memblock parameter, because it is now effectively stale. Only after memory block devices have been added, memory can be onlined by user space. This implies, that memory is not visible to user space at all before arch_add_memory() succeeded. While at it - use WARN_ON_ONCE instead of BUG_ON in moved unregister_memory() - introduce find_memory_block_by_id() to search via block id - Use find_memory_block_by_id() in init_memory_block() to catch duplicates Link: http://lkml.kernel.org/r/20190527111152.16324-8-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: David Hildenbrand Cc: "mike.travis@hpe.com" Cc: Ingo Molnar Cc: Andrew Banman Cc: Oscar Salvador Cc: Qian Cai Cc: Wei Yang Cc: Arun KS Cc: Mathieu Malaterre Cc: Alex Deucher Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Chris Wilson Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Fenghua Yu Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Jonathan Cameron Cc: Joonsoo Kim Cc: Jun Yao Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Brown Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 82 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 28 deletions(-) (limited to 'drivers') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 92459d6f12be..18a30c3ac0ef 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -39,6 +39,11 @@ static inline int base_memory_block_id(int section_nr) return section_nr / sections_per_block; } +static inline int pfn_to_block_id(unsigned long pfn) +{ + return base_memory_block_id(pfn_to_section_nr(pfn)); +} + static int memory_subsys_online(struct device *dev); static int memory_subsys_offline(struct device *dev); @@ -582,10 +587,9 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn) * A reference for the returned object is held and the reference for the * hinted object is released. */ -struct memory_block *find_memory_block_hinted(struct mem_section *section, - struct memory_block *hint) +static struct memory_block *find_memory_block_by_id(int block_id, + struct memory_block *hint) { - int block_id = base_memory_block_id(__section_nr(section)); struct device *hintdev = hint ? &hint->dev : NULL; struct device *dev; @@ -597,6 +601,14 @@ struct memory_block *find_memory_block_hinted(struct mem_section *section, return to_memory_block(dev); } +struct memory_block *find_memory_block_hinted(struct mem_section *section, + struct memory_block *hint) +{ + int block_id = base_memory_block_id(__section_nr(section)); + + return find_memory_block_by_id(block_id, hint); +} + /* * For now, we have a linear search to go find the appropriate * memory_block corresponding to a particular phys_index. If @@ -658,6 +670,11 @@ static int init_memory_block(struct memory_block **memory, int block_id, unsigned long start_pfn; int ret = 0; + mem = find_memory_block_by_id(block_id, NULL); + if (mem) { + put_device(&mem->dev); + return -EEXIST; + } mem = kzalloc(sizeof(*mem), GFP_KERNEL); if (!mem) return -ENOMEM; @@ -695,44 +712,53 @@ static int add_memory_block(int base_section_nr) return 0; } +static void unregister_memory(struct memory_block *memory) +{ + if (WARN_ON_ONCE(memory->dev.bus != &memory_subsys)) + return; + + /* drop the ref. we got via find_memory_block() */ + put_device(&memory->dev); + device_unregister(&memory->dev); +} + /* - * need an interface for the VM to add new memory regions, - * but without onlining it. + * Create memory block devices for the given memory area. Start and size + * have to be aligned to memory block granularity. Memory block devices + * will be initialized as offline. */ -int hotplug_memory_register(int nid, struct mem_section *section) +int create_memory_block_devices(unsigned long start, unsigned long size) { - int block_id = base_memory_block_id(__section_nr(section)); - int ret = 0; + const int start_block_id = pfn_to_block_id(PFN_DOWN(start)); + int end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); struct memory_block *mem; + unsigned long block_id; + int ret = 0; - mutex_lock(&mem_sysfs_mutex); + if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) || + !IS_ALIGNED(size, memory_block_size_bytes()))) + return -EINVAL; - mem = find_memory_block(section); - if (mem) { - mem->section_count++; - put_device(&mem->dev); - } else { + mutex_lock(&mem_sysfs_mutex); + for (block_id = start_block_id; block_id != end_block_id; block_id++) { ret = init_memory_block(&mem, block_id, MEM_OFFLINE); if (ret) - goto out; - mem->section_count++; + break; + mem->section_count = sections_per_block; + } + if (ret) { + end_block_id = block_id; + for (block_id = start_block_id; block_id != end_block_id; + block_id++) { + mem = find_memory_block_by_id(block_id, NULL); + mem->section_count = 0; + unregister_memory(mem); + } } - -out: mutex_unlock(&mem_sysfs_mutex); return ret; } -static void -unregister_memory(struct memory_block *memory) -{ - BUG_ON(memory->dev.bus != &memory_subsys); - - /* drop the ref. we got via find_memory_block() */ - put_device(&memory->dev); - device_unregister(&memory->dev); -} - void unregister_memory_section(struct mem_section *section) { struct memory_block *mem; -- cgit v1.2.3 From 4c4b7f9ba9486c565aead99a198ceeef73ae81f6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:06 -0700 Subject: mm/memory_hotplug: remove memory block devices before arch_remove_memory() Let's factor out removing of memory block devices, which is only necessary for memory added via add_memory() and friends that created memory block devices. Remove the devices before calling arch_remove_memory(). This finishes factoring out memory block device handling from arch_add_memory() and arch_remove_memory(). Link: http://lkml.kernel.org/r/20190527111152.16324-10-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Dan Williams Acked-by: Michal Hocko Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: David Hildenbrand Cc: "mike.travis@hpe.com" Cc: Andrew Banman Cc: Ingo Molnar Cc: Alex Deucher Cc: "David S. Miller" Cc: Mark Brown Cc: Chris Wilson Cc: Oscar Salvador Cc: Jonathan Cameron Cc: Arun KS Cc: Mathieu Malaterre Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Dave Hansen Cc: Fenghua Yu Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Joonsoo Kim Cc: Jun Yao Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Rutland Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Qian Cai Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Wei Yang Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 37 ++++++++++++++++++------------------- drivers/base/node.c | 11 ++++++----- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'drivers') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 18a30c3ac0ef..826dd76f662e 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -759,32 +759,31 @@ int create_memory_block_devices(unsigned long start, unsigned long size) return ret; } -void unregister_memory_section(struct mem_section *section) +/* + * Remove memory block devices for the given memory area. Start and size + * have to be aligned to memory block granularity. Memory block devices + * have to be offline. + */ +void remove_memory_block_devices(unsigned long start, unsigned long size) { + const int start_block_id = pfn_to_block_id(PFN_DOWN(start)); + const int end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); struct memory_block *mem; + int block_id; - if (WARN_ON_ONCE(!present_section(section))) + if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) || + !IS_ALIGNED(size, memory_block_size_bytes()))) return; mutex_lock(&mem_sysfs_mutex); - - /* - * Some users of the memory hotplug do not want/need memblock to - * track all sections. Skip over those. - */ - mem = find_memory_block(section); - if (!mem) - goto out_unlock; - - unregister_mem_sect_under_nodes(mem, __section_nr(section)); - - mem->section_count--; - if (mem->section_count == 0) + for (block_id = start_block_id; block_id != end_block_id; block_id++) { + mem = find_memory_block_by_id(block_id, NULL); + if (WARN_ON_ONCE(!mem)) + continue; + mem->section_count = 0; + unregister_memory_block_under_nodes(mem); unregister_memory(mem); - else - put_device(&mem->dev); - -out_unlock: + } mutex_unlock(&mem_sysfs_mutex); } diff --git a/drivers/base/node.c b/drivers/base/node.c index aa878fbcf705..0b0f38c2c7cd 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -802,9 +802,10 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) return 0; } -/* unregister memory section under all nodes that it spans */ -int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, - unsigned long phys_index) +/* + * Unregister memory block device under all nodes that it spans. + */ +int unregister_memory_block_under_nodes(struct memory_block *mem_blk) { NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL); unsigned long pfn, sect_start_pfn, sect_end_pfn; @@ -817,8 +818,8 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, return -ENOMEM; nodes_clear(*unlinked_nodes); - sect_start_pfn = section_nr_to_pfn(phys_index); - sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1; + sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); + sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { int nid; -- cgit v1.2.3 From a31b264c2b415b29660da0bc2ba291a98629ce51 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:12 -0700 Subject: mm/memory_hotplug: make unregister_memory_block_under_nodes() never fail We really don't want anything during memory hotunplug to fail. We always pass a valid memory block device, that check can go. Avoid allocating memory and eventually failing. As we are always called under lock, we can use a static piece of memory. This avoids having to put the structure onto the stack, having to guess about the stack size of callers. Patch inspired by a patch from Oscar Salvador. In the future, there might be no need to iterate over nodes at all. mem->nid should tell us exactly what to remove. Memory block devices with mixed nodes (added during boot) should properly fenced off and never removed. Link: http://lkml.kernel.org/r/20190527111152.16324-11-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Wei Yang Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Alex Deucher Cc: "David S. Miller" Cc: Mark Brown Cc: Chris Wilson Cc: David Hildenbrand Cc: Jonathan Cameron Cc: Andrew Banman Cc: Andy Lutomirski Cc: Anshuman Khandual Cc: Ard Biesheuvel Cc: Arun KS Cc: Baoquan He Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Chintan Pandya Cc: Christophe Leroy Cc: Dan Williams Cc: Dave Hansen Cc: Fenghua Yu Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Jun Yao Cc: "Kirill A. Shutemov" Cc: Logan Gunthorpe Cc: Mark Rutland Cc: Masahiro Yamada Cc: Mathieu Malaterre Cc: Michael Ellerman Cc: Mike Rapoport Cc: "mike.travis@hpe.com" Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Pavel Tatashin Cc: Peter Zijlstra Cc: Qian Cai Cc: Rich Felker Cc: Rob Herring Cc: Robin Murphy Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/base/node.c b/drivers/base/node.c index 0b0f38c2c7cd..beec80649b33 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -804,20 +804,14 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) /* * Unregister memory block device under all nodes that it spans. + * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes). */ -int unregister_memory_block_under_nodes(struct memory_block *mem_blk) +void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { - NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL); unsigned long pfn, sect_start_pfn, sect_end_pfn; + static nodemask_t unlinked_nodes; - if (!mem_blk) { - NODEMASK_FREE(unlinked_nodes); - return -EFAULT; - } - if (!unlinked_nodes) - return -ENOMEM; - nodes_clear(*unlinked_nodes); - + nodes_clear(unlinked_nodes); sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { @@ -828,15 +822,13 @@ int unregister_memory_block_under_nodes(struct memory_block *mem_blk) continue; if (!node_online(nid)) continue; - if (node_test_and_set(nid, *unlinked_nodes)) + if (node_test_and_set(nid, unlinked_nodes)) continue; sysfs_remove_link(&node_devices[nid]->dev.kobj, kobject_name(&mem_blk->dev.kobj)); sysfs_remove_link(&mem_blk->dev.kobj, kobject_name(&node_devices[nid]->dev.kobj)); } - NODEMASK_FREE(unlinked_nodes); - return 0; } int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) -- cgit v1.2.3 From 2491f0a2c0b117b9097e9c9eee0c21f2e5f716d7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:37 -0700 Subject: mm: section numbers use the type "unsigned long" Patch series "mm: Further memory block device cleanups", v1. Some further cleanups around memory block devices. Especially, clean up and simplify walk_memory_range(). Including some other minor cleanups. This patch (of 6): We are using a mixture of "int" and "unsigned long". Let's make this consistent by using "unsigned long" everywhere. We'll do the same with memory block ids next. While at it, turn the "unsigned long i" in removable_show() into an int - sections_per_block is an int. [akpm@linux-foundation.org: s/unsigned long i/unsigned long nr/] [david@redhat.com: v3] Link: http://lkml.kernel.org/r/20190620183139.4352-2-david@redhat.com Link: http://lkml.kernel.org/r/20190614100114.311-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Vlastimil Babka Cc: Michal Hocko Cc: Dan Williams Cc: Mel Gorman Cc: Wei Yang Cc: Johannes Weiner Cc: Arun KS Cc: Pavel Tatashin Cc: Oscar Salvador Cc: Stephen Rothwell Cc: Mike Rapoport Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 826dd76f662e..5947b5a5686d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -34,7 +34,7 @@ static DEFINE_MUTEX(mem_sysfs_mutex); static int sections_per_block; -static inline int base_memory_block_id(int section_nr) +static inline int base_memory_block_id(unsigned long section_nr) { return section_nr / sections_per_block; } @@ -131,9 +131,9 @@ static ssize_t phys_index_show(struct device *dev, static ssize_t removable_show(struct device *dev, struct device_attribute *attr, char *buf) { - unsigned long i, pfn; - int ret = 1; struct memory_block *mem = to_memory_block(dev); + unsigned long pfn; + int ret = 1, i; if (mem->state != MEM_ONLINE) goto out; @@ -691,15 +691,15 @@ static int init_memory_block(struct memory_block **memory, int block_id, return ret; } -static int add_memory_block(int base_section_nr) +static int add_memory_block(unsigned long base_section_nr) { + int ret, section_count = 0; struct memory_block *mem; - int i, ret, section_count = 0; + unsigned long nr; - for (i = base_section_nr; - i < base_section_nr + sections_per_block; - i++) - if (present_section_nr(i)) + for (nr = base_section_nr; nr < base_section_nr + sections_per_block; + nr++) + if (present_section_nr(nr)) section_count++; if (section_count == 0) @@ -822,10 +822,9 @@ static const struct attribute_group *memory_root_attr_groups[] = { */ int __init memory_dev_init(void) { - unsigned int i; int ret; int err; - unsigned long block_sz; + unsigned long block_sz, nr; ret = subsys_system_register(&memory_subsys, memory_root_attr_groups); if (ret) @@ -839,9 +838,9 @@ int __init memory_dev_init(void) * during boot and have been initialized */ mutex_lock(&mem_sysfs_mutex); - for (i = 0; i <= __highest_present_section_nr; - i += sections_per_block) { - err = add_memory_block(i); + for (nr = 0; nr <= __highest_present_section_nr; + nr += sections_per_block) { + err = add_memory_block(nr); if (!ret) ret = err; } -- cgit v1.2.3 From 90ec010fe0d690665852d6bac21643e9ae7affd8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:40 -0700 Subject: drivers/base/memory: use "unsigned long" for block ids Block ids are just shifted section numbers, so let's also use "unsigned long" for them, too. Link: http://lkml.kernel.org/r/20190614100114.311-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 5947b5a5686d..c54e80fd25a8 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -34,12 +34,12 @@ static DEFINE_MUTEX(mem_sysfs_mutex); static int sections_per_block; -static inline int base_memory_block_id(unsigned long section_nr) +static inline unsigned long base_memory_block_id(unsigned long section_nr) { return section_nr / sections_per_block; } -static inline int pfn_to_block_id(unsigned long pfn) +static inline unsigned long pfn_to_block_id(unsigned long pfn) { return base_memory_block_id(pfn_to_section_nr(pfn)); } @@ -587,7 +587,7 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn) * A reference for the returned object is held and the reference for the * hinted object is released. */ -static struct memory_block *find_memory_block_by_id(int block_id, +static struct memory_block *find_memory_block_by_id(unsigned long block_id, struct memory_block *hint) { struct device *hintdev = hint ? &hint->dev : NULL; @@ -604,7 +604,7 @@ static struct memory_block *find_memory_block_by_id(int block_id, struct memory_block *find_memory_block_hinted(struct mem_section *section, struct memory_block *hint) { - int block_id = base_memory_block_id(__section_nr(section)); + unsigned long block_id = base_memory_block_id(__section_nr(section)); return find_memory_block_by_id(block_id, hint); } @@ -663,8 +663,8 @@ int register_memory(struct memory_block *memory) return ret; } -static int init_memory_block(struct memory_block **memory, int block_id, - unsigned long state) +static int init_memory_block(struct memory_block **memory, + unsigned long block_id, unsigned long state) { struct memory_block *mem; unsigned long start_pfn; @@ -729,8 +729,8 @@ static void unregister_memory(struct memory_block *memory) */ int create_memory_block_devices(unsigned long start, unsigned long size) { - const int start_block_id = pfn_to_block_id(PFN_DOWN(start)); - int end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); + const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); + unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); struct memory_block *mem; unsigned long block_id; int ret = 0; @@ -766,10 +766,10 @@ int create_memory_block_devices(unsigned long start, unsigned long size) */ void remove_memory_block_devices(unsigned long start, unsigned long size) { - const int start_block_id = pfn_to_block_id(PFN_DOWN(start)); - const int end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); + const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); + const unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size)); struct memory_block *mem; - int block_id; + unsigned long block_id; if (WARN_ON_ONCE(!IS_ALIGNED(start, memory_block_size_bytes()) || !IS_ALIGNED(size, memory_block_size_bytes()))) -- cgit v1.2.3 From 8d595c4c0f768f19db043d378b22e98405f9fd47 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:43 -0700 Subject: mm: make register_mem_sect_under_node() static It is only used internally. Link: http://lkml.kernel.org/r/20190614100114.311-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Keith Busch Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/base/node.c b/drivers/base/node.c index beec80649b33..27391f1e8f60 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -753,7 +753,8 @@ static int __ref get_nid_for_pfn(unsigned long pfn) } /* register memory section under specified node if it spans that node */ -int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) +static int register_mem_sect_under_node(struct memory_block *mem_blk, + void *arg) { int ret, nid = *(int *)arg; unsigned long pfn, sect_start_pfn, sect_end_pfn; -- cgit v1.2.3 From fbcf73ce65827c3d8935f38b832a43153a0c78d1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:46 -0700 Subject: mm/memory_hotplug: rename walk_memory_range() and pass start+size instead of pfns walk_memory_range() was once used to iterate over sections. Now, it iterates over memory blocks. Rename the function, fixup the documentation. Also, pass start+size instead of PFNs, which is what most callers already have at hand. (we'll rework link_mem_sections() most probably soon) Follow-up patches will rework, simplify, and move walk_memory_blocks() to drivers/base/memory.c. Note: walk_memory_blocks() only works correctly right now if the start_pfn is aligned to a section start. This is the case right now, but we'll generalize the function in a follow up patch so the semantics match the documentation. [akpm@linux-foundation.org: remove unused variable] Link: http://lkml.kernel.org/r/20190614100114.311-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Greg Kroah-Hartman Cc: David Hildenbrand Cc: Rashmica Gupta Cc: Pavel Tatashin Cc: Anshuman Khandual Cc: Michael Neuling Cc: Thomas Gleixner Cc: Oscar Salvador Cc: Michal Hocko Cc: Wei Yang Cc: Juergen Gross Cc: Qian Cai Cc: Arun KS Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/acpi/acpi_memhotplug.c | 19 ++++--------------- drivers/base/node.c | 5 +++-- 2 files changed, 7 insertions(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index db013dc21c02..e294f44a7850 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -155,16 +155,6 @@ static int acpi_memory_check_device(struct acpi_memory_device *mem_device) return 0; } -static unsigned long acpi_meminfo_start_pfn(struct acpi_memory_info *info) -{ - return PFN_DOWN(info->start_addr); -} - -static unsigned long acpi_meminfo_end_pfn(struct acpi_memory_info *info) -{ - return PFN_UP(info->start_addr + info->length-1); -} - static int acpi_bind_memblk(struct memory_block *mem, void *arg) { return acpi_bind_one(&mem->dev, arg); @@ -173,9 +163,8 @@ static int acpi_bind_memblk(struct memory_block *mem, void *arg) static int acpi_bind_memory_blocks(struct acpi_memory_info *info, struct acpi_device *adev) { - return walk_memory_range(acpi_meminfo_start_pfn(info), - acpi_meminfo_end_pfn(info), adev, - acpi_bind_memblk); + return walk_memory_blocks(info->start_addr, info->length, adev, + acpi_bind_memblk); } static int acpi_unbind_memblk(struct memory_block *mem, void *arg) @@ -186,8 +175,8 @@ static int acpi_unbind_memblk(struct memory_block *mem, void *arg) static void acpi_unbind_memory_blocks(struct acpi_memory_info *info) { - walk_memory_range(acpi_meminfo_start_pfn(info), - acpi_meminfo_end_pfn(info), NULL, acpi_unbind_memblk); + walk_memory_blocks(info->start_addr, info->length, NULL, + acpi_unbind_memblk); } static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) diff --git a/drivers/base/node.c b/drivers/base/node.c index 27391f1e8f60..75b7e6f6535b 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -834,8 +834,9 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk) int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) { - return walk_memory_range(start_pfn, end_pfn, (void *)&nid, - register_mem_sect_under_node); + return walk_memory_blocks(PFN_PHYS(start_pfn), + PFN_PHYS(end_pfn - start_pfn), (void *)&nid, + register_mem_sect_under_node); } #ifdef CONFIG_HUGETLBFS -- cgit v1.2.3 From ea8846411ad686ff626e00bb2c3821b3db2ab56a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:50 -0700 Subject: mm/memory_hotplug: move and simplify walk_memory_blocks() Let's move walk_memory_blocks() to the place where memory block logic resides and simplify it. While at it, add a type for the callback function. Link: http://lkml.kernel.org/r/20190614100114.311-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: David Hildenbrand Cc: Stephen Rothwell Cc: Pavel Tatashin Cc: Andrew Banman Cc: Mike Travis Cc: Oscar Salvador Cc: Michal Hocko Cc: Wei Yang Cc: Arun KS Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'drivers') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c54e80fd25a8..0204384b4d1d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -44,6 +44,11 @@ static inline unsigned long pfn_to_block_id(unsigned long pfn) return base_memory_block_id(pfn_to_section_nr(pfn)); } +static inline unsigned long phys_to_block_id(unsigned long phys) +{ + return pfn_to_block_id(PFN_DOWN(phys)); +} + static int memory_subsys_online(struct device *dev); static int memory_subsys_offline(struct device *dev); @@ -851,3 +856,40 @@ out: printk(KERN_ERR "%s() failed: %d\n", __func__, ret); return ret; } + +/** + * walk_memory_blocks - walk through all present memory blocks overlapped + * by the range [start, start + size) + * + * @start: start address of the memory range + * @size: size of the memory range + * @arg: argument passed to func + * @func: callback for each memory section walked + * + * This function walks through all present memory blocks overlapped by the + * range [start, start + size), calling func on each memory block. + * + * In case func() returns an error, walking is aborted and the error is + * returned. + */ +int walk_memory_blocks(unsigned long start, unsigned long size, + void *arg, walk_memory_blocks_func_t func) +{ + const unsigned long start_block_id = phys_to_block_id(start); + const unsigned long end_block_id = phys_to_block_id(start + size - 1); + struct memory_block *mem; + unsigned long block_id; + int ret = 0; + + for (block_id = start_block_id; block_id <= end_block_id; block_id++) { + mem = find_memory_block_by_id(block_id, NULL); + if (!mem) + continue; + + ret = func(mem, arg); + put_device(&mem->dev); + if (ret) + break; + } + return ret; +} -- cgit v1.2.3 From dd625285910d3cff535fa76355e49949513918a4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 18 Jul 2019 15:57:53 -0700 Subject: drivers/base/memory.c: get rid of find_memory_block_hinted() No longer needed, let's remove it. Also, drop the "hint" parameter completely from "find_memory_block_by_id", as nobody needs it anymore. [david@redhat.com: v3] Link: http://lkml.kernel.org/r/20190620183139.4352-7-david@redhat.com [david@redhat.com: handle zero-length walks] Link: http://lkml.kernel.org/r/1c2edc22-afd7-2211-c4c7-40e54e5007e8@redhat.com Link: http://lkml.kernel.org/r/20190614100114.311-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Tested-by: Qian Cai Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: David Hildenbrand Cc: Stephen Rothwell Cc: Pavel Tatashin Cc: Andrew Banman Cc: Mike Travis Cc: Oscar Salvador Cc: Michal Hocko Cc: Wei Yang Cc: Arun KS Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 40 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) (limited to 'drivers') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 0204384b4d1d..20c39d1bcef8 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -588,30 +588,13 @@ int __weak arch_get_memory_phys_device(unsigned long start_pfn) return 0; } -/* - * A reference for the returned object is held and the reference for the - * hinted object is released. - */ -static struct memory_block *find_memory_block_by_id(unsigned long block_id, - struct memory_block *hint) +/* A reference for the returned memory block device is acquired. */ +static struct memory_block *find_memory_block_by_id(unsigned long block_id) { - struct device *hintdev = hint ? &hint->dev : NULL; struct device *dev; - dev = subsys_find_device_by_id(&memory_subsys, block_id, hintdev); - if (hint) - put_device(&hint->dev); - if (!dev) - return NULL; - return to_memory_block(dev); -} - -struct memory_block *find_memory_block_hinted(struct mem_section *section, - struct memory_block *hint) -{ - unsigned long block_id = base_memory_block_id(__section_nr(section)); - - return find_memory_block_by_id(block_id, hint); + dev = subsys_find_device_by_id(&memory_subsys, block_id, NULL); + return dev ? to_memory_block(dev) : NULL; } /* @@ -624,7 +607,9 @@ struct memory_block *find_memory_block_hinted(struct mem_section *section, */ struct memory_block *find_memory_block(struct mem_section *section) { - return find_memory_block_hinted(section, NULL); + unsigned long block_id = base_memory_block_id(__section_nr(section)); + + return find_memory_block_by_id(block_id); } static struct attribute *memory_memblk_attrs[] = { @@ -675,7 +660,7 @@ static int init_memory_block(struct memory_block **memory, unsigned long start_pfn; int ret = 0; - mem = find_memory_block_by_id(block_id, NULL); + mem = find_memory_block_by_id(block_id); if (mem) { put_device(&mem->dev); return -EEXIST; @@ -755,7 +740,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size) end_block_id = block_id; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - mem = find_memory_block_by_id(block_id, NULL); + mem = find_memory_block_by_id(block_id); mem->section_count = 0; unregister_memory(mem); } @@ -782,7 +767,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) mutex_lock(&mem_sysfs_mutex); for (block_id = start_block_id; block_id != end_block_id; block_id++) { - mem = find_memory_block_by_id(block_id, NULL); + mem = find_memory_block_by_id(block_id); if (WARN_ON_ONCE(!mem)) continue; mem->section_count = 0; @@ -881,8 +866,11 @@ int walk_memory_blocks(unsigned long start, unsigned long size, unsigned long block_id; int ret = 0; + if (!size) + return 0; + for (block_id = start_block_id; block_id <= end_block_id; block_id++) { - mem = find_memory_block_by_id(block_id, NULL); + mem = find_memory_block_by_id(block_id); if (!mem) continue; -- cgit v1.2.3 From 7e3e888dfc138089f4c15a81b418e88f0978f744 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:36 -0700 Subject: libnvdimm/pfn: fix fsdax-mode namespace info-block zero-fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At namespace creation time there is the potential for the "expected to be zero" fields of a 'pfn' info-block to be filled with indeterminate data. While the kernel buffer is zeroed on allocation it is immediately overwritten by nd_pfn_validate() filling it with the current contents of the on-media info-block location. For fields like, 'flags' and the 'padding' it potentially means that future implementations can not rely on those fields being zero. In preparation to stop using the 'start_pad' and 'end_trunc' fields for section alignment, arrange for fields that are not explicitly initialized to be guaranteed zero. Bump the minor version to indicate it is safe to assume the 'padding' and 'flags' are zero. Otherwise, this corruption is expected to benign since all other critical fields are explicitly initialized. Note The cc: stable is about spreading this new policy to as many kernels as possible not fixing an issue in those kernels. It is not until the change titled "libnvdimm/pfn: Stop padding pmem namespaces to section alignment" where this improper initialization becomes a problem. So if someone decides to backport "libnvdimm/pfn: Stop padding pmem namespaces to section alignment" (which is not tagged for stable), make sure this pre-requisite is flagged. Link: http://lkml.kernel.org/r/156092356065.979959.6681003754765958296.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: 32ab0a3f5170 ("libnvdimm, pmem: 'struct page' for pmem") Signed-off-by: Dan Williams Tested-by: Aneesh Kumar K.V [ppc64] Cc: Cc: David Hildenbrand Cc: Jane Chu Cc: Jeff Moyer Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Logan Gunthorpe Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Toshi Kani Cc: Vlastimil Babka Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/nvdimm/dax_devs.c | 2 +- drivers/nvdimm/pfn.h | 1 + drivers/nvdimm/pfn_devs.c | 18 +++++++++++++++--- 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/dax_devs.c b/drivers/nvdimm/dax_devs.c index 49fc18ee0565..6d22b0f83b3b 100644 --- a/drivers/nvdimm/dax_devs.c +++ b/drivers/nvdimm/dax_devs.c @@ -118,7 +118,7 @@ int nd_dax_probe(struct device *dev, struct nd_namespace_common *ndns) nvdimm_bus_unlock(&ndns->dev); if (!dax_dev) return -ENOMEM; - pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); + pfn_sb = devm_kmalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); nd_pfn->pfn_sb = pfn_sb; rc = nd_pfn_validate(nd_pfn, DAX_SIG); dev_dbg(dev, "dax: %s\n", rc == 0 ? dev_name(dax_dev) : ""); diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h index f58b849e455b..dfb2bcda8f5a 100644 --- a/drivers/nvdimm/pfn.h +++ b/drivers/nvdimm/pfn.h @@ -28,6 +28,7 @@ struct nd_pfn_sb { __le32 end_trunc; /* minor-version-2 record the base alignment of the mapping */ __le32 align; + /* minor-version-3 guarantee the padding and flags are zero */ u8 padding[4000]; __le64 checksum; }; diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 55fb6b7433ed..06f465c0baf3 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -412,6 +412,15 @@ static int nd_pfn_clear_memmap_errors(struct nd_pfn *nd_pfn) return 0; } +/** + * nd_pfn_validate - read and validate info-block + * @nd_pfn: fsdax namespace runtime state / properties + * @sig: 'devdax' or 'fsdax' signature + * + * Upon return the info-block buffer contents (->pfn_sb) are + * indeterminate when validation fails, and a coherent info-block + * otherwise. + */ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig) { u64 checksum, offset; @@ -557,7 +566,7 @@ int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns) nvdimm_bus_unlock(&ndns->dev); if (!pfn_dev) return -ENOMEM; - pfn_sb = devm_kzalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); + pfn_sb = devm_kmalloc(dev, sizeof(*pfn_sb), GFP_KERNEL); nd_pfn = to_nd_pfn(pfn_dev); nd_pfn->pfn_sb = pfn_sb; rc = nd_pfn_validate(nd_pfn, PFN_SIG); @@ -693,7 +702,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) u64 checksum; int rc; - pfn_sb = devm_kzalloc(&nd_pfn->dev, sizeof(*pfn_sb), GFP_KERNEL); + pfn_sb = devm_kmalloc(&nd_pfn->dev, sizeof(*pfn_sb), GFP_KERNEL); if (!pfn_sb) return -ENOMEM; @@ -702,11 +711,14 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) sig = DAX_SIG; else sig = PFN_SIG; + rc = nd_pfn_validate(nd_pfn, sig); if (rc != -ENODEV) return rc; /* no info block, do init */; + memset(pfn_sb, 0, sizeof(*pfn_sb)); + nd_region = to_nd_region(nd_pfn->dev.parent); if (nd_region->ro) { dev_info(&nd_pfn->dev, @@ -759,7 +771,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) memcpy(pfn_sb->uuid, nd_pfn->uuid, 16); memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16); pfn_sb->version_major = cpu_to_le16(1); - pfn_sb->version_minor = cpu_to_le16(2); + pfn_sb->version_minor = cpu_to_le16(3); pfn_sb->start_pad = cpu_to_le32(start_pad); pfn_sb->end_trunc = cpu_to_le32(end_trunc); pfn_sb->align = cpu_to_le32(nd_pfn->align); -- cgit v1.2.3 From a3619190d62ed9d66416891be2416f6bea2b3ca4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Jul 2019 15:58:40 -0700 Subject: libnvdimm/pfn: stop padding pmem namespaces to section alignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the mm core supports section-unaligned hotplug of ZONE_DEVICE memory, we no longer need to add padding at pfn/dax device creation time. The kernel will still honor padding established by older kernels. Link: http://lkml.kernel.org/r/156092356588.979959.6793371748950931916.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Jeff Moyer Tested-by: Aneesh Kumar K.V [ppc64] Cc: David Hildenbrand Cc: Jane Chu Cc: Jérôme Glisse Cc: Jonathan Corbet Cc: Logan Gunthorpe Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Toshi Kani Cc: Vlastimil Babka Cc: Wei Yang Cc: Jason Gunthorpe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/nvdimm/pfn.h | 14 --------- drivers/nvdimm/pfn_devs.c | 77 ++++++++--------------------------------------- 2 files changed, 13 insertions(+), 78 deletions(-) (limited to 'drivers') diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h index dfb2bcda8f5a..7381673b7b70 100644 --- a/drivers/nvdimm/pfn.h +++ b/drivers/nvdimm/pfn.h @@ -33,18 +33,4 @@ struct nd_pfn_sb { __le64 checksum; }; -#ifdef CONFIG_SPARSEMEM -#define PFN_SECTION_ALIGN_DOWN(x) SECTION_ALIGN_DOWN(x) -#define PFN_SECTION_ALIGN_UP(x) SECTION_ALIGN_UP(x) -#else -/* - * In this case ZONE_DEVICE=n and we will disable 'pfn' device support, - * but we still want pmem to compile. - */ -#define PFN_SECTION_ALIGN_DOWN(x) (x) -#define PFN_SECTION_ALIGN_UP(x) (x) -#endif - -#define PHYS_SECTION_ALIGN_DOWN(x) PFN_PHYS(PFN_SECTION_ALIGN_DOWN(PHYS_PFN(x))) -#define PHYS_SECTION_ALIGN_UP(x) PFN_PHYS(PFN_SECTION_ALIGN_UP(PHYS_PFN(x))) #endif /* __NVDIMM_PFN_H */ diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 06f465c0baf3..df2bdbd22450 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -587,14 +587,14 @@ static u32 info_block_reserve(void) } /* - * We hotplug memory at section granularity, pad the reserved area from - * the previous section base to the namespace base address. + * We hotplug memory at sub-section granularity, pad the reserved area + * from the previous section base to the namespace base address. */ static unsigned long init_altmap_base(resource_size_t base) { unsigned long base_pfn = PHYS_PFN(base); - return PFN_SECTION_ALIGN_DOWN(base_pfn); + return SUBSECTION_ALIGN_DOWN(base_pfn); } static unsigned long init_altmap_reserve(resource_size_t base) @@ -602,7 +602,7 @@ static unsigned long init_altmap_reserve(resource_size_t base) unsigned long reserve = info_block_reserve() >> PAGE_SHIFT; unsigned long base_pfn = PHYS_PFN(base); - reserve += base_pfn - PFN_SECTION_ALIGN_DOWN(base_pfn); + reserve += base_pfn - SUBSECTION_ALIGN_DOWN(base_pfn); return reserve; } @@ -632,8 +632,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) return -EINVAL; nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); } else if (nd_pfn->mode == PFN_MODE_PMEM) { - nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res) - - offset) / PAGE_SIZE); + nd_pfn->npfns = PHYS_PFN((resource_size(res) - offset)); if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns) dev_info(&nd_pfn->dev, "number of pfns truncated from %lld to %ld\n", @@ -649,54 +648,14 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) return 0; } -static u64 phys_pmem_align_down(struct nd_pfn *nd_pfn, u64 phys) -{ - return min_t(u64, PHYS_SECTION_ALIGN_DOWN(phys), - ALIGN_DOWN(phys, nd_pfn->align)); -} - -/* - * Check if pmem collides with 'System RAM', or other regions when - * section aligned. Trim it accordingly. - */ -static void trim_pfn_device(struct nd_pfn *nd_pfn, u32 *start_pad, u32 *end_trunc) -{ - struct nd_namespace_common *ndns = nd_pfn->ndns; - struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); - struct nd_region *nd_region = to_nd_region(nd_pfn->dev.parent); - const resource_size_t start = nsio->res.start; - const resource_size_t end = start + resource_size(&nsio->res); - resource_size_t adjust, size; - - *start_pad = 0; - *end_trunc = 0; - - adjust = start - PHYS_SECTION_ALIGN_DOWN(start); - size = resource_size(&nsio->res) + adjust; - if (region_intersects(start - adjust, size, IORESOURCE_SYSTEM_RAM, - IORES_DESC_NONE) == REGION_MIXED - || nd_region_conflict(nd_region, start - adjust, size)) - *start_pad = PHYS_SECTION_ALIGN_UP(start) - start; - - /* Now check that end of the range does not collide. */ - adjust = PHYS_SECTION_ALIGN_UP(end) - end; - size = resource_size(&nsio->res) + adjust; - if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM, - IORES_DESC_NONE) == REGION_MIXED - || !IS_ALIGNED(end, nd_pfn->align) - || nd_region_conflict(nd_region, start, size)) - *end_trunc = end - phys_pmem_align_down(nd_pfn, end); -} - static int nd_pfn_init(struct nd_pfn *nd_pfn) { struct nd_namespace_common *ndns = nd_pfn->ndns; struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); - u32 start_pad, end_trunc, reserve = info_block_reserve(); resource_size_t start, size; struct nd_region *nd_region; + unsigned long npfns, align; struct nd_pfn_sb *pfn_sb; - unsigned long npfns; phys_addr_t offset; const char *sig; u64 checksum; @@ -727,43 +686,35 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) return -ENXIO; } - memset(pfn_sb, 0, sizeof(*pfn_sb)); - - trim_pfn_device(nd_pfn, &start_pad, &end_trunc); - if (start_pad + end_trunc) - dev_info(&nd_pfn->dev, "%s alignment collision, truncate %d bytes\n", - dev_name(&ndns->dev), start_pad + end_trunc); - /* * Note, we use 64 here for the standard size of struct page, * debugging options may cause it to be larger in which case the * implementation will limit the pfns advertised through * ->direct_access() to those that are included in the memmap. */ - start = nsio->res.start + start_pad; + start = nsio->res.start; size = resource_size(&nsio->res); - npfns = PFN_SECTION_ALIGN_UP((size - start_pad - end_trunc - reserve) - / PAGE_SIZE); + npfns = PHYS_PFN(size - SZ_8K); + align = max(nd_pfn->align, (1UL << SUBSECTION_SHIFT)); if (nd_pfn->mode == PFN_MODE_PMEM) { /* * The altmap should be padded out to the block size used * when populating the vmemmap. This *should* be equal to * PMD_SIZE for most architectures. */ - offset = ALIGN(start + reserve + 64 * npfns, - max(nd_pfn->align, PMD_SIZE)) - start; + offset = ALIGN(start + SZ_8K + 64 * npfns, align) - start; } else if (nd_pfn->mode == PFN_MODE_RAM) - offset = ALIGN(start + reserve, nd_pfn->align) - start; + offset = ALIGN(start + SZ_8K, align) - start; else return -ENXIO; - if (offset + start_pad + end_trunc >= size) { + if (offset >= size) { dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n", dev_name(&ndns->dev)); return -ENXIO; } - npfns = (size - offset - start_pad - end_trunc) / SZ_4K; + npfns = PHYS_PFN(size - offset); pfn_sb->mode = cpu_to_le32(nd_pfn->mode); pfn_sb->dataoff = cpu_to_le64(offset); pfn_sb->npfns = cpu_to_le64(npfns); @@ -772,8 +723,6 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16); pfn_sb->version_major = cpu_to_le16(1); pfn_sb->version_minor = cpu_to_le16(3); - pfn_sb->start_pad = cpu_to_le32(start_pad); - pfn_sb->end_trunc = cpu_to_le32(end_trunc); pfn_sb->align = cpu_to_le32(nd_pfn->align); checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); pfn_sb->checksum = cpu_to_le64(checksum); -- cgit v1.2.3 From eec4844fae7c033a0c1fc1eb3b8517aeb8b6cc49 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 18 Jul 2019 15:58:50 -0700 Subject: proc/sysctl: add shared variables for range check In the sysctl code the proc_dointvec_minmax() function is often used to validate the user supplied value between an allowed range. This function uses the extra1 and extra2 members from struct ctl_table as minimum and maximum allowed value. On sysctl handler declaration, in every source file there are some readonly variables containing just an integer which address is assigned to the extra1 and extra2 members, so the sysctl range is enforced. The special values 0, 1 and INT_MAX are very often used as range boundary, leading duplication of variables like zero=0, one=1, int_max=INT_MAX in different source files: $ git grep -E '\.extra[12].*&(zero|one|int_max)' |wc -l 248 Add a const int array containing the most commonly used values, some macros to refer more easily to the correct array member, and use them instead of creating a local one for every object file. This is the bloat-o-meter output comparing the old and new binary compiled with the default Fedora config: # scripts/bloat-o-meter -d vmlinux.o.old vmlinux.o add/remove: 2/2 grow/shrink: 0/2 up/down: 24/-188 (-164) Data old new delta sysctl_vals - 12 +12 __kstrtab_sysctl_vals - 12 +12 max 14 10 -4 int_max 16 - -16 one 68 - -68 zero 128 28 -100 Total: Before=20583249, After=20583085, chg -0.00% [mcroce@redhat.com: tipc: remove two unused variables] Link: http://lkml.kernel.org/r/20190530091952.4108-1-mcroce@redhat.com [akpm@linux-foundation.org: fix net/ipv6/sysctl_net_ipv6.c] [arnd@arndb.de: proc/sysctl: make firmware loader table conditional] Link: http://lkml.kernel.org/r/20190617130014.1713870-1-arnd@arndb.de [akpm@linux-foundation.org: fix fs/eventpoll.c] Link: http://lkml.kernel.org/r/20190430180111.10688-1-mcroce@redhat.com Signed-off-by: Matteo Croce Signed-off-by: Arnd Bergmann Acked-by: Kees Cook Reviewed-by: Aaron Tomlin Cc: Matthew Wilcox Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/firmware_loader/fallback_table.c | 13 ++++++------- drivers/gpu/drm/i915/i915_perf.c | 8 +++----- drivers/hv/vmbus_drv.c | 6 ++---- drivers/tty/tty_ldisc.c | 6 ++---- drivers/xen/balloon.c | 7 ++----- 5 files changed, 15 insertions(+), 25 deletions(-) (limited to 'drivers') diff --git a/drivers/base/firmware_loader/fallback_table.c b/drivers/base/firmware_loader/fallback_table.c index 776dd69cf5be..ba9d30b28edc 100644 --- a/drivers/base/firmware_loader/fallback_table.c +++ b/drivers/base/firmware_loader/fallback_table.c @@ -16,9 +16,6 @@ * firmware fallback configuration table */ -static unsigned int zero; -static unsigned int one = 1; - struct firmware_fallback_config fw_fallback_config = { .force_sysfs_fallback = IS_ENABLED(CONFIG_FW_LOADER_USER_HELPER_FALLBACK), .loading_timeout = 60, @@ -26,6 +23,7 @@ struct firmware_fallback_config fw_fallback_config = { }; EXPORT_SYMBOL_GPL(fw_fallback_config); +#ifdef CONFIG_SYSCTL struct ctl_table firmware_config_table[] = { { .procname = "force_sysfs_fallback", @@ -33,8 +31,8 @@ struct ctl_table firmware_config_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "ignore_sysfs_fallback", @@ -42,9 +40,10 @@ struct ctl_table firmware_config_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; EXPORT_SYMBOL_GPL(firmware_config_table); +#endif diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 3d8162d28730..a700c5c3d167 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -274,8 +274,6 @@ #define POLL_PERIOD (NSEC_PER_SEC / POLL_FREQUENCY) /* for sysctl proc_dointvec_minmax of dev.i915.perf_stream_paranoid */ -static int zero; -static int one = 1; static u32 i915_perf_stream_paranoid = true; /* The maximum exponent the hardware accepts is 63 (essentially it selects one @@ -3366,8 +3364,8 @@ static struct ctl_table oa_table[] = { .maxlen = sizeof(i915_perf_stream_paranoid), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "oa_max_sample_rate", @@ -3375,7 +3373,7 @@ static struct ctl_table oa_table[] = { .maxlen = sizeof(i915_oa_max_sample_rate), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = SYSCTL_ZERO, .extra2 = &oa_sample_rate_hard_limit, }, {} diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 894da5abdc55..ebd35fc35290 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1197,8 +1197,6 @@ static struct kmsg_dumper hv_kmsg_dumper = { }; static struct ctl_table_header *hv_ctl_table_hdr; -static int zero; -static int one = 1; /* * sysctl option to allow the user to control whether kmsg data should be @@ -1211,8 +1209,8 @@ static struct ctl_table hv_ctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE }, {} }; diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index fde8d4073e74..4c49f53afa3e 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -855,8 +855,6 @@ void tty_ldisc_deinit(struct tty_struct *tty) tty->ldisc = NULL; } -static int zero; -static int one = 1; static struct ctl_table tty_table[] = { { .procname = "ldisc_autoload", @@ -864,8 +862,8 @@ static struct ctl_table tty_table[] = { .maxlen = sizeof(tty_ldisc_autoload), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index d37dd5bb7a8f..37a36c6b9f93 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -77,9 +77,6 @@ static int xen_hotplug_unpopulated; #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG -static int zero; -static int one = 1; - static struct ctl_table balloon_table[] = { { .procname = "hotplug_unpopulated", @@ -87,8 +84,8 @@ static struct ctl_table balloon_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { } }; -- cgit v1.2.3