From 18fa84a2db0e15b02baa5d94bdb5bd509175d2f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 29 May 2019 13:46:25 -0700 Subject: cgroup: Use css_tryget() instead of css_tryget_online() in task_get_css() A PF_EXITING task can stay associated with an offline css. If such task calls task_get_css(), it can get stuck indefinitely. This can be triggered by BSD process accounting which writes to a file with PF_EXITING set when racing against memcg disable as in the backtrace at the end. After this change, task_get_css() may return a css which was already offline when the function was called. None of the existing users are affected by this change. INFO: rcu_sched self-detected stall on CPU INFO: rcu_sched detected stalls on CPUs/tasks: ... NMI backtrace for cpu 0 ... Call Trace: dump_stack+0x46/0x68 nmi_cpu_backtrace.cold.2+0x13/0x57 nmi_trigger_cpumask_backtrace+0xba/0xca rcu_dump_cpu_stacks+0x9e/0xce rcu_check_callbacks.cold.74+0x2af/0x433 update_process_times+0x28/0x60 tick_sched_timer+0x34/0x70 __hrtimer_run_queues+0xee/0x250 hrtimer_interrupt+0xf4/0x210 smp_apic_timer_interrupt+0x56/0x110 apic_timer_interrupt+0xf/0x20 RIP: 0010:balance_dirty_pages_ratelimited+0x28f/0x3d0 ... btrfs_file_write_iter+0x31b/0x563 __vfs_write+0xfa/0x140 __kernel_write+0x4f/0x100 do_acct_process+0x495/0x580 acct_process+0xb9/0xdb do_exit+0x748/0xa00 do_group_exit+0x3a/0xa0 get_signal+0x254/0x560 do_signal+0x23/0x5c0 exit_to_usermode_loop+0x5d/0xa0 prepare_exit_to_usermode+0x53/0x80 retint_user+0x8/0x8 Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v4.2+ Fixes: ec438699a9ae ("cgroup, block: implement task_get_css() and use it in bio_associate_current()") --- include/linux/cgroup.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c0077adeea83..a7e4611e20c8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -487,7 +487,7 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, * * Find the css for the (@task, @subsys_id) combination, increment a * reference on and return it. This function is guaranteed to return a - * valid css. + * valid css. The returned css may already have been offlined. */ static inline struct cgroup_subsys_state * task_get_css(struct task_struct *task, int subsys_id) @@ -497,7 +497,13 @@ task_get_css(struct task_struct *task, int subsys_id) rcu_read_lock(); while (true) { css = task_css(task, subsys_id); - if (likely(css_tryget_online(css))) + /* + * Can't use css_tryget_online() here. A task which has + * PF_EXITING set may stay associated with an offline css. + * If such task calls this function, css_tryget_online() + * will keep failing. + */ + if (likely(css_tryget(css))) break; cpu_relax(); } -- cgit v1.2.3 From b636fd38dc40113f853337a7d2a6885ad23b8811 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 31 May 2019 10:38:58 -0700 Subject: cgroup: Implement css_task_iter_skip() When a task is moved out of a cset, task iterators pointing to the task are advanced using the normal css_task_iter_advance() call. This is fine but we'll be tracking dying tasks on csets and thus moving tasks from cset->tasks to (to be added) cset->dying_tasks. When we remove a task from cset->tasks, if we advance the iterators, they may move over to the next cset before we had the chance to add the task back on the dying list, which can allow the task to escape iteration. This patch separates out skipping from advancing. Skipping only moves the affected iterators to the next pointer rather than fully advancing it and the following advancing will recognize that the cursor has already been moved forward and do the rest of advancing. This ensures that when a task moves from one list to another in its cset, as long as it moves in the right direction, it's always visible to iteration. This doesn't cause any visible behavior changes. Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- include/linux/cgroup.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a7e4611e20c8..05ed2a209e74 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -43,6 +43,9 @@ /* walk all threaded css_sets in the domain */ #define CSS_TASK_ITER_THREADED (1U << 1) +/* internal flags */ +#define CSS_TASK_ITER_SKIPPED (1U << 16) + /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; -- cgit v1.2.3 From c03cd7738a83b13739f00546166969342c8ff014 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 31 May 2019 10:38:58 -0700 Subject: cgroup: Include dying leaders with live threads in PROCS iterations CSS_TASK_ITER_PROCS currently iterates live group leaders; however, this means that a process with dying leader and live threads will be skipped. IOW, cgroup.procs might be empty while cgroup.threads isn't, which is confusing to say the least. Fix it by making cset track dying tasks and include dying leaders with live threads in PROCS iteration. Signed-off-by: Tejun Heo Reported-and-tested-by: Topi Miettinen Cc: Oleg Nesterov --- include/linux/cgroup-defs.h | 1 + include/linux/cgroup.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 77258d276f93..1615b9c17e02 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -216,6 +216,7 @@ struct css_set { */ struct list_head tasks; struct list_head mg_tasks; + struct list_head dying_tasks; /* all css_task_iters currently walking this cset */ struct list_head task_iters; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 05ed2a209e74..0297f930a56e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -60,6 +60,7 @@ struct css_task_iter { struct list_head *task_pos; struct list_head *tasks_head; struct list_head *mg_tasks_head; + struct list_head *dying_tasks_head; struct css_set *cur_cset; struct css_set *cur_dcset; -- cgit v1.2.3 From cf8929885de318c0bf73438c9e5dde59d6536f7c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Jun 2019 03:35:41 -0600 Subject: cgroup/bfq: revert bfq.weight symlink change There's some discussion on how to do this the best, and Tejun prefers that BFQ just create the file itself instead of having cgroups support a symlink feature. Hence revert commit 54b7b868e826 and 19e9da9e86c4 for 5.2, and this can be done properly for 5.3. Signed-off-by: Jens Axboe --- include/linux/cgroup-defs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index d71b079bb021..11e215d7937e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -106,8 +106,6 @@ enum { CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ - CFTYPE_SYMLINKED = (1 << 6), /* pointed to by symlink too */ - /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ @@ -545,7 +543,6 @@ struct cftype { * end of cftype array. */ char name[MAX_CFTYPE_NAME]; - char link_name[MAX_CFTYPE_NAME]; unsigned long private; /* -- cgit v1.2.3 From 815744d75152078cde5391fc1e3c2d4424323fb6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jun 2019 15:55:46 -0700 Subject: mm: memcontrol: don't batch updates of local VM stats and events The kernel test robot noticed a 26% will-it-scale pagefault regression from commit 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty"). This appears to be caused by bouncing the additional cachelines from the new hierarchical statistics counters. We can fix this by getting rid of the batched local counters instead. Originally, there were *only* group-local counters, and they were fully maintained per cpu. A reader of a stats file high up in the cgroup tree would have to walk the entire subtree and collect each level's per-cpu counters to get the recursive view. This was prohibitively expensive, and so we switched to per-cpu batched updates of the local counters during a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting"), reducing the complexity from nr_subgroups * nr_cpus to nr_subgroups. With growing machines and cgroup trees, the tree walk itself became too expensive for monitoring top-level groups, and this is when the culprit patch added hierarchy counters on each cgroup level. When the per-cpu batch size would be reached, both the local and the hierarchy counters would get batch-updated from the per-cpu delta simultaneously. This makes local and hierarchical counter reads blazingly fast, but it unfortunately makes the write-side too cache line intense. Since local counter reads were never a problem - we only centralized them to accelerate the hierarchy walk - and use of the local counters are becoming rarer due to replacement with hierarchical views (ongoing rework in the page reclaim and workingset code), we can make those local counters unbatched per-cpu counters again. The scheme will then be as such: when a memcg statistic changes, the writer will: - update the local counter (per-cpu) - update the batch counter (per-cpu). If the batch is full: - spill the batch into the group's atomic_t - spill the batch into all ancestors' atomic_ts - empty out the batch counter (per-cpu) when a local memcg counter is read, the reader will: - collect the local counter from all cpus when a hiearchy memcg counter is read, the reader will: - read the atomic_t We might be able to simplify this further and make the recursive counters unbatched per-cpu counters as well (batch upward propagation, but leave per-cpu collection to the readers), but that will require a more in-depth analysis and testing of all the callsites. Deal with the immediate regression for now. Link: http://lkml.kernel.org/r/20190521151647.GB2870@cmpxchg.org Fixes: 42a300353577 ("mm: memcontrol: fix recursive statistics correctness & scalabilty") Signed-off-by: Johannes Weiner Reported-by: kernel test robot Tested-by: kernel test robot Cc: Michal Hocko Cc: Shakeel Butt Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index edf9e8f32d70..1dcb763bb610 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -117,9 +117,12 @@ struct memcg_shrinker_map { struct mem_cgroup_per_node { struct lruvec lruvec; + /* Legacy local VM stats */ + struct lruvec_stat __percpu *lruvec_stat_local; + + /* Subtree VM stats (batched updates) */ struct lruvec_stat __percpu *lruvec_stat_cpu; atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; - atomic_long_t lruvec_stat_local[NR_VM_NODE_STAT_ITEMS]; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; @@ -265,17 +268,18 @@ struct mem_cgroup { atomic_t moving_account; struct task_struct *move_lock_task; - /* memory.stat */ + /* Legacy local VM stats and events */ + struct memcg_vmstats_percpu __percpu *vmstats_local; + + /* Subtree VM stats and events (batched updates) */ struct memcg_vmstats_percpu __percpu *vmstats_percpu; MEMCG_PADDING(_pad2_); atomic_long_t vmstats[MEMCG_NR_STAT]; - atomic_long_t vmstats_local[MEMCG_NR_STAT]; - atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; - atomic_long_t vmevents_local[NR_VM_EVENT_ITEMS]; + /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; unsigned long socket_pressure; @@ -567,7 +571,11 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { - long x = atomic_long_read(&memcg->vmstats_local[idx]); + long x = 0; + int cpu; + + for_each_possible_cpu(cpu) + x += per_cpu(memcg->vmstats_local->stat[idx], cpu); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -641,13 +649,15 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; - long x; + long x = 0; + int cpu; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - x = atomic_long_read(&pn->lruvec_stat_local[idx]); + for_each_possible_cpu(cpu) + x += per_cpu(pn->lruvec_stat_local->count[idx], cpu); #ifdef CONFIG_SMP if (x < 0) x = 0; -- cgit v1.2.3 From 59ea6d06cfa9247b586a695c21f94afa7183af74 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jun 2019 15:56:11 -0700 Subject: coredump: fix race condition between collapse_huge_page() and core dumping When fixing the race conditions between the coredump and the mmap_sem holders outside the context of the process, we focused on mmget_not_zero()/get_task_mm() callers in 04f5866e41fb70 ("coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping"), but those aren't the only cases where the mmap_sem can be taken outside of the context of the process as Michal Hocko noticed while backporting that commit to older -stable kernels. If mmgrab() is called in the context of the process, but then the mm_count reference is transferred outside the context of the process, that can also be a problem if the mmap_sem has to be taken for writing through that mm_count reference. khugepaged registration calls mmgrab() in the context of the process, but the mmap_sem for writing is taken later in the context of the khugepaged kernel thread. collapse_huge_page() after taking the mmap_sem for writing doesn't modify any vma, so it's not obvious that it could cause a problem to the coredump, but it happens to modify the pmd in a way that breaks an invariant that pmd_trans_huge_lock() relies upon. collapse_huge_page() needs the mmap_sem for writing just to block concurrent page faults that call pmd_trans_huge_lock(). Specifically the invariant that "!pmd_trans_huge()" cannot become a "pmd_trans_huge()" doesn't hold while collapse_huge_page() runs. The coredump will call __get_user_pages() without mmap_sem for reading, which eventually can invoke a lockless page fault which will need a functional pmd_trans_huge_lock(). So collapse_huge_page() needs to use mmget_still_valid() to check it's not running concurrently with the coredump... as long as the coredump can invoke page faults without holding the mmap_sem for reading. This has "Fixes: khugepaged" to facilitate backporting, but in my view it's more a bug in the coredump code that will eventually have to be rewritten to stop invoking page faults without the mmap_sem for reading. So the long term plan is still to drop all mmget_still_valid(). Link: http://lkml.kernel.org/r/20190607161558.32104-1-aarcange@redhat.com Fixes: ba76149f47d8 ("thp: khugepaged") Signed-off-by: Andrea Arcangeli Reported-by: Michal Hocko Acked-by: Michal Hocko Acked-by: Kirill A. Shutemov Cc: Oleg Nesterov Cc: Jann Horn Cc: Hugh Dickins Cc: Mike Rapoport Cc: Mike Kravetz Cc: Peter Xu Cc: Jason Gunthorpe Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index a3fda9f024c3..4a7944078cc3 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -54,6 +54,10 @@ static inline void mmdrop(struct mm_struct *mm) * followed by taking the mmap_sem for writing before modifying the * vmas or anything the coredump pretends not to change from under it. * + * It also has to be called when mmgrab() is used in the context of + * the process, but then the mm_count refcount is transferred outside + * the context of the process to run down_write() on that pinned mm. + * * NOTE: find_extend_vma() called from GUP context is the only place * that can modify the "mm" (notably the vm_start/end) under mmap_sem * for reading and outside the context of the process, so it is also -- cgit v1.2.3 From 2374b682255184d7ef75fcb507ce5af4995ead32 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 13 Jun 2019 15:56:18 -0700 Subject: drivers/base/devres: introduce devm_release_action() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/devm_memremap_pages: Fix page release race", v2. Logan audited the devm_memremap_pages() shutdown path and noticed that it was possible to proceed to arch_remove_memory() before all potential page references have been reaped. Introduce a new ->cleanup() callback to do the work of waiting for any straggling page references and then perform the percpu_ref_exit() in devm_memremap_pages_release() context. For p2pdma this involves some deeper reworks to reference count resources on a per-instance basis rather than a per pci-device basis. A modified genalloc api is introduced to convey a driver-private pointer through gen_pool_{alloc,free}() interfaces. Also, a devm_memunmap_pages() api is introduced since p2pdma does not auto-release resources on a setup failure. The dax and pmem changes pass the nvdimm unit tests, and the p2pdma changes should now pass testing with the pci_p2pdma_release() fix. Jrme, how does this look for HMM? This patch (of 6): The devm_add_action() facility allows a resource allocation routine to add custom devm semantics. One such user is devm_memremap_pages(). There is now a need to manually trigger devm_memremap_pages_release(). Introduce devm_release_action() so the release action can be triggered via a new devm_memunmap_pages() api in a follow-on change. Link: http://lkml.kernel.org/r/155727336530.292046.2926860263201336366.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Logan Gunthorpe Cc: Bjorn Helgaas Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: "Jérôme Glisse" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index e85264fb6616..848fc71c6ba6 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -713,6 +713,7 @@ void __iomem *devm_of_iomap(struct device *dev, /* allows to add/remove a custom action to devres stack */ int devm_add_action(struct device *dev, void (*action)(void *), void *data); void devm_remove_action(struct device *dev, void (*action)(void *), void *data); +void devm_release_action(struct device *dev, void (*action)(void *), void *data); static inline int devm_add_action_or_reset(struct device *dev, void (*action)(void *), void *data) -- cgit v1.2.3 From 2e3f139e8ecebf177fe01299285a56855e93fb84 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 13 Jun 2019 15:56:21 -0700 Subject: mm/devm_memremap_pages: introduce devm_memunmap_pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the new devm_release_action() facility to allow devm_memremap_pages_release() to be manually triggered. Link: http://lkml.kernel.org/r/155727337088.292046.5774214552136776763.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Logan Gunthorpe Cc: Bjorn Helgaas Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: "Jérôme Glisse" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memremap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f0628660d541..7601ee314c4a 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -100,6 +100,7 @@ struct dev_pagemap { #ifdef CONFIG_ZONE_DEVICE void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); +void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap); @@ -118,6 +119,11 @@ static inline void *devm_memremap_pages(struct device *dev, return ERR_PTR(-ENXIO); } +static inline void devm_memunmap_pages(struct device *dev, + struct dev_pagemap *pgmap) +{ +} + static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap) { -- cgit v1.2.3 From 795ee30648c708502da9df637f83c33361d68dcc Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 13 Jun 2019 15:56:27 -0700 Subject: lib/genalloc: introduce chunk owners MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The p2pdma facility enables a provider to publish a pool of dma addresses for a consumer to allocate. A genpool is used internally by p2pdma to collect dma resources, 'chunks', to be handed out to consumers. Whenever a consumer allocates a resource it needs to pin the 'struct dev_pagemap' instance that backs the chunk selected by pci_alloc_p2pmem(). Currently that reference is taken globally on the entire provider device. That sets up a lifetime mismatch whereby the p2pdma core needs to maintain hacks to make sure the percpu_ref is not released twice. This lifetime mismatch also stands in the way of a fix to devm_memremap_pages() whereby devm_memremap_pages_release() must wait for the percpu_ref ->release() callback to complete before it can proceed to teardown pages. So, towards fixing this situation, introduce the ability to store a 'chunk owner' at gen_pool_add() time, and a facility to retrieve the owner at gen_pool_{alloc,free}() time. For p2pdma this will be used to store and recall individual dev_pagemap reference counter instances per-chunk. Link: http://lkml.kernel.org/r/155727338118.292046.13407378933221579644.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reviewed-by: Ira Weiny Reviewed-by: Logan Gunthorpe Cc: Bjorn Helgaas Cc: "Jérôme Glisse" Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 55 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index dd0a452373e7..a337313e064f 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -75,6 +75,7 @@ struct gen_pool_chunk { struct list_head next_chunk; /* next chunk in pool */ atomic_long_t avail; phys_addr_t phys_addr; /* physical starting address of memory chunk */ + void *owner; /* private data to retrieve at alloc time */ unsigned long start_addr; /* start address of memory chunk */ unsigned long end_addr; /* end address of memory chunk (inclusive) */ unsigned long bits[0]; /* bitmap for allocating memory chunk */ @@ -96,8 +97,15 @@ struct genpool_data_fixed { extern struct gen_pool *gen_pool_create(int, int); extern phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long); -extern int gen_pool_add_virt(struct gen_pool *, unsigned long, phys_addr_t, - size_t, int); +extern int gen_pool_add_owner(struct gen_pool *, unsigned long, phys_addr_t, + size_t, int, void *); + +static inline int gen_pool_add_virt(struct gen_pool *pool, unsigned long addr, + phys_addr_t phys, size_t size, int nid) +{ + return gen_pool_add_owner(pool, addr, phys, size, nid, NULL); +} + /** * gen_pool_add - add a new chunk of special memory to the pool * @pool: pool to add new memory chunk to @@ -116,12 +124,47 @@ static inline int gen_pool_add(struct gen_pool *pool, unsigned long addr, return gen_pool_add_virt(pool, addr, -1, size, nid); } extern void gen_pool_destroy(struct gen_pool *); -extern unsigned long gen_pool_alloc(struct gen_pool *, size_t); -extern unsigned long gen_pool_alloc_algo(struct gen_pool *, size_t, - genpool_algo_t algo, void *data); +unsigned long gen_pool_alloc_algo_owner(struct gen_pool *pool, size_t size, + genpool_algo_t algo, void *data, void **owner); + +static inline unsigned long gen_pool_alloc_owner(struct gen_pool *pool, + size_t size, void **owner) +{ + return gen_pool_alloc_algo_owner(pool, size, pool->algo, pool->data, + owner); +} + +static inline unsigned long gen_pool_alloc_algo(struct gen_pool *pool, + size_t size, genpool_algo_t algo, void *data) +{ + return gen_pool_alloc_algo_owner(pool, size, algo, data, NULL); +} + +/** + * gen_pool_alloc - allocate special memory from the pool + * @pool: pool to allocate from + * @size: number of bytes to allocate from the pool + * + * Allocate the requested number of bytes from the specified pool. + * Uses the pool allocation function (with first-fit algorithm by default). + * Can not be used in NMI handler on architectures without + * NMI-safe cmpxchg implementation. + */ +static inline unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) +{ + return gen_pool_alloc_algo(pool, size, pool->algo, pool->data); +} + extern void *gen_pool_dma_alloc(struct gen_pool *pool, size_t size, dma_addr_t *dma); -extern void gen_pool_free(struct gen_pool *, unsigned long, size_t); +extern void gen_pool_free_owner(struct gen_pool *pool, unsigned long addr, + size_t size, void **owner); +static inline void gen_pool_free(struct gen_pool *pool, unsigned long addr, + size_t size) +{ + gen_pool_free_owner(pool, addr, size, NULL); +} + extern void gen_pool_for_each_chunk(struct gen_pool *, void (*)(struct gen_pool *, struct gen_pool_chunk *, void *), void *); extern size_t gen_pool_avail(struct gen_pool *); -- cgit v1.2.3 From 50f44ee7248ad2f7984ef081974a6ecd09724b3e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 13 Jun 2019 15:56:33 -0700 Subject: mm/devm_memremap_pages: fix final page put race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Logan noticed that devm_memremap_pages_release() kills the percpu_ref drops all the page references that were acquired at init and then immediately proceeds to unplug, arch_remove_memory(), the backing pages for the pagemap. If for some reason device shutdown actually collides with a busy / elevated-ref-count page then arch_remove_memory() should be deferred until after that reference is dropped. As it stands the "wait for last page ref drop" happens *after* devm_memremap_pages_release() returns, which is obviously too late and can lead to crashes. Fix this situation by assigning the responsibility to wait for the percpu_ref to go idle to devm_memremap_pages() with a new ->cleanup() callback. Implement the new cleanup callback for all devm_memremap_pages() users: pmem, devdax, hmm, and p2pdma. Link: http://lkml.kernel.org/r/155727339156.292046.5432007428235387859.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: 41e94a851304 ("add devm_memremap_pages") Signed-off-by: Dan Williams Reported-by: Logan Gunthorpe Reviewed-by: Ira Weiny Reviewed-by: Logan Gunthorpe Cc: Bjorn Helgaas Cc: "Jérôme Glisse" Cc: Christoph Hellwig Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memremap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 7601ee314c4a..1732dea030b2 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -81,6 +81,7 @@ typedef void (*dev_page_free_t)(struct page *page, void *data); * @res: physical address range covered by @ref * @ref: reference count that pins the devm_memremap_pages() mapping * @kill: callback to transition @ref to the dead state + * @cleanup: callback to wait for @ref to be idle and reap it * @dev: host device of the mapping for debug * @data: private data pointer for page_free() * @type: memory type: see MEMORY_* in memory_hotplug.h @@ -92,6 +93,7 @@ struct dev_pagemap { struct resource res; struct percpu_ref *ref; void (*kill)(struct percpu_ref *ref); + void (*cleanup)(struct percpu_ref *ref); struct device *dev; void *data; enum memory_type type; -- cgit v1.2.3 From 78f4e932f7760d965fb1569025d1576ab77557c5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 13 Jun 2019 15:49:02 +0200 Subject: x86/microcode, cpuhotplug: Add a microcode loader CPU hotplug callback Adric Blake reported the following warning during suspend-resume: Enabling non-boot CPUs ... x86: Booting SMP configuration: smpboot: Booting Node 0 Processor 1 APIC 0x2 unchecked MSR access error: WRMSR to 0x10f (tried to write 0x0000000000000000) \ at rIP: 0xffffffff8d267924 (native_write_msr+0x4/0x20) Call Trace: intel_set_tfa intel_pmu_cpu_starting ? x86_pmu_dead_cpu x86_pmu_starting_cpu cpuhp_invoke_callback ? _raw_spin_lock_irqsave notify_cpu_starting start_secondary secondary_startup_64 microcode: sig=0x806ea, pf=0x80, revision=0x96 microcode: updated to revision 0xb4, date = 2019-04-01 CPU1 is up The MSR in question is MSR_TFA_RTM_FORCE_ABORT and that MSR is emulated by microcode. The log above shows that the microcode loader callback happens after the PMU restoration, leading to the conjecture that because the microcode hasn't been updated yet, that MSR is not present yet, leading to the #GP. Add a microcode loader-specific hotplug vector which comes before the PERF vectors and thus executes earlier and makes sure the MSR is present. Fixes: 400816f60c54 ("perf/x86/intel: Implement support for TSX Force Abort") Reported-by: Adric Blake Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Cc: x86@kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=203637 --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 6a381594608c..5c6062206760 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -101,6 +101,7 @@ enum cpuhp_state { CPUHP_AP_IRQ_BCM2836_STARTING, CPUHP_AP_IRQ_MIPS_GIC_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, + CPUHP_AP_MICROCODE_LOADER, CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, CPUHP_AP_PERF_X86_AMD_IBS_STARTING, -- cgit v1.2.3