From b795854b1fa70f6aee923ae5df74ff7afeaddcaa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:29:07 +0100 Subject: sched/numa: Set preferred NUMA node based on number of private faults Ideally it would be possible to distinguish between NUMA hinting faults that are private to a task and those that are shared. If treated identically there is a risk that shared pages bounce between nodes depending on the order they are referenced by tasks. Ultimately what is desirable is that task private pages remain local to the task while shared pages are interleaved between sharing tasks running on different nodes to give good average performance. This is further complicated by THP as even applications that partition their data may not be partitioning on a huge page boundary. To start with, this patch assumes that multi-threaded or multi-process applications partition their data and that private accesses are more important for cpu->memory locality in the general case. Also, no new infrastructure is required to treat private pages properly but interleaving for shared pages requires additional infrastructure. To detect private accesses the pid of the last accessing task is required but the storage requirements are high. This patch borrows heavily from Ingo Molnar's patch "numa, mm, sched: Implement last-CPU+PID hash tracking" to encode some bits from the last accessing task in the page flags as well as the node information. Collisions will occur but it is better than just depending on the node information. Node information is then used to determine if a page needs to migrate. The PID information is used to detect private/shared accesses. The preferred NUMA node is selected based on where the maximum number of approximately private faults were measured. Shared faults are not taken into consideration for a few reasons. First, if there are many tasks sharing the page then they'll all move towards the same node. The node will become compute overloaded and the tasks scheduled away later, only to bounce back again. Alternatively, the shared tasks would just bounce around nodes because the fault information is effectively noise. Either way, accounting for shared faults the same as private faults can result in lower performance overall. The second reason is based on a hypothetical workload that has a small number of very important, heavily accessed private pages but a large shared array. The shared array would dominate the number of faults and be selected as the preferred node even though it's the wrong decision. The third reason is that multiple threads in a process will race each other to fault the shared page, making the fault information unreliable. Signed-off-by: Mel Gorman [ Fix compilation error when !NUMA_BALANCING. ] Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-30-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mm.h | 89 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 22 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b6e55ee8855..bb412ce2a8b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -581,11 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NID] | ...
| FLAGS | */ +/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NIDPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_NID_PGOFF (ZONES_PGOFF - LAST_NID_WIDTH) +#define LAST_NIDPID_PGOFF (ZONES_PGOFF - LAST_NIDPID_WIDTH) /* * Define the bit shifts to access each section. For non-existent @@ -595,7 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_NID_PGSHIFT (LAST_NID_PGOFF * (LAST_NID_WIDTH != 0)) +#define LAST_NIDPID_PGSHIFT (LAST_NIDPID_PGOFF * (LAST_NIDPID_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS @@ -617,7 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_NID_MASK ((1UL << LAST_NID_WIDTH) - 1) +#define LAST_NIDPID_MASK ((1UL << LAST_NIDPID_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) @@ -661,48 +661,93 @@ static inline int page_to_nid(const struct page *page) #endif #ifdef CONFIG_NUMA_BALANCING -#ifdef LAST_NID_NOT_IN_PAGE_FLAGS -static inline int page_nid_xchg_last(struct page *page, int nid) +static inline int nid_pid_to_nidpid(int nid, int pid) { - return xchg(&page->_last_nid, nid); + return ((nid & LAST__NID_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } -static inline int page_nid_last(struct page *page) +static inline int nidpid_to_pid(int nidpid) { - return page->_last_nid; + return nidpid & LAST__PID_MASK; } -static inline void page_nid_reset_last(struct page *page) + +static inline int nidpid_to_nid(int nidpid) +{ + return (nidpid >> LAST__PID_SHIFT) & LAST__NID_MASK; +} + +static inline bool nidpid_pid_unset(int nidpid) +{ + return nidpid_to_pid(nidpid) == (-1 & LAST__PID_MASK); +} + +static inline bool nidpid_nid_unset(int nidpid) { - page->_last_nid = -1; + return nidpid_to_nid(nidpid) == (-1 & LAST__NID_MASK); +} + +#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS +static inline int page_nidpid_xchg_last(struct page *page, int nid) +{ + return xchg(&page->_last_nidpid, nid); +} + +static inline int page_nidpid_last(struct page *page) +{ + return page->_last_nidpid; +} +static inline void page_nidpid_reset_last(struct page *page) +{ + page->_last_nidpid = -1; } #else -static inline int page_nid_last(struct page *page) +static inline int page_nidpid_last(struct page *page) { - return (page->flags >> LAST_NID_PGSHIFT) & LAST_NID_MASK; + return (page->flags >> LAST_NIDPID_PGSHIFT) & LAST_NIDPID_MASK; } -extern int page_nid_xchg_last(struct page *page, int nid); +extern int page_nidpid_xchg_last(struct page *page, int nidpid); -static inline void page_nid_reset_last(struct page *page) +static inline void page_nidpid_reset_last(struct page *page) { - int nid = (1 << LAST_NID_SHIFT) - 1; + int nidpid = (1 << LAST_NIDPID_SHIFT) - 1; - page->flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); - page->flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; + page->flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT); + page->flags |= (nidpid & LAST_NIDPID_MASK) << 
LAST_NIDPID_PGSHIFT; } -#endif /* LAST_NID_NOT_IN_PAGE_FLAGS */ +#endif /* LAST_NIDPID_NOT_IN_PAGE_FLAGS */ #else -static inline int page_nid_xchg_last(struct page *page, int nid) +static inline int page_nidpid_xchg_last(struct page *page, int nidpid) { return page_to_nid(page); } -static inline int page_nid_last(struct page *page) +static inline int page_nidpid_last(struct page *page) { return page_to_nid(page); } -static inline void page_nid_reset_last(struct page *page) +static inline int nidpid_to_nid(int nidpid) +{ + return -1; +} + +static inline int nidpid_to_pid(int nidpid) +{ + return -1; +} + +static inline int nid_pid_to_nidpid(int nid, int pid) +{ + return -1; +} + +static inline bool nidpid_pid_unset(int nidpid) +{ + return 1; +} + +static inline void page_nidpid_reset_last(struct page *page) { } #endif -- cgit v1.2.3 From 90572890d202527c366aa9489b32404e88a7c020 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:20 +0100 Subject: mm: numa: Change page last {nid,pid} into {cpu,pid} Change the per page last fault tracking to use cpu,pid instead of nid,pid. This will allow us to try and look up the alternate task more easily. Note that even though it is the cpu that is stored in the page flags, the mpol_misplaced decision is still based on the node. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-43-git-send-email-mgorman@suse.de [ Fixed build failure on 32-bit systems. ] Signed-off-by: Ingo Molnar --- include/linux/mm.h | 90 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 40 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index bb412ce2a8b5..ce464cd4777e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -581,11 +581,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * sets it, so none of the operations on it need to be atomic. */ -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_NIDPID] | ... | FLAGS | */ +/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_NIDPID_PGOFF (ZONES_PGOFF - LAST_NIDPID_WIDTH) +#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) /* * Define the bit shifts to access each section.
For non-existent @@ -595,7 +595,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_NIDPID_PGSHIFT (LAST_NIDPID_PGOFF * (LAST_NIDPID_WIDTH != 0)) +#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS @@ -617,7 +617,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_NIDPID_MASK ((1UL << LAST_NIDPID_WIDTH) - 1) +#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) @@ -661,96 +661,106 @@ static inline int page_to_nid(const struct page *page) #endif #ifdef CONFIG_NUMA_BALANCING -static inline int nid_pid_to_nidpid(int nid, int pid) +static inline int cpu_pid_to_cpupid(int cpu, int pid) { - return ((nid & LAST__NID_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); + return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } -static inline int nidpid_to_pid(int nidpid) +static inline int cpupid_to_pid(int cpupid) { - return nidpid & LAST__PID_MASK; + return cpupid & LAST__PID_MASK; } -static inline int nidpid_to_nid(int nidpid) +static inline int cpupid_to_cpu(int cpupid) { - return (nidpid >> LAST__PID_SHIFT) & LAST__NID_MASK; + return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; } -static inline bool nidpid_pid_unset(int nidpid) +static inline int cpupid_to_nid(int cpupid) { - return nidpid_to_pid(nidpid) == (-1 & LAST__PID_MASK); + return cpu_to_node(cpupid_to_cpu(cpupid)); } -static inline bool nidpid_nid_unset(int nidpid) +static inline bool cpupid_pid_unset(int cpupid) { - return nidpid_to_nid(nidpid) == (-1 & LAST__NID_MASK); + return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); } -#ifdef LAST_NIDPID_NOT_IN_PAGE_FLAGS -static inline int page_nidpid_xchg_last(struct page *page, int nid) +static inline bool cpupid_cpu_unset(int cpupid) { - return xchg(&page->_last_nidpid, nid); + return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } -static inline int page_nidpid_last(struct page *page) +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { - return page->_last_nidpid; + return xchg(&page->_last_cpupid, cpupid); } -static inline void page_nidpid_reset_last(struct page *page) + +static inline int page_cpupid_last(struct page *page) +{ + return page->_last_cpupid; +} +static inline void page_cpupid_reset_last(struct page *page) { - page->_last_nidpid = -1; + page->_last_cpupid = -1; } #else -static inline int page_nidpid_last(struct page *page) +static inline int page_cpupid_last(struct page *page) { - return (page->flags >> LAST_NIDPID_PGSHIFT) & LAST_NIDPID_MASK; + return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } -extern int page_nidpid_xchg_last(struct page *page, int nidpid); +extern int page_cpupid_xchg_last(struct page *page, int cpupid); -static inline void page_nidpid_reset_last(struct page *page) +static inline void page_cpupid_reset_last(struct page *page) { - int nidpid = (1 << LAST_NIDPID_SHIFT) - 1; + int cpupid = (1 << 
LAST_CPUPID_SHIFT) - 1; - page->flags &= ~(LAST_NIDPID_MASK << LAST_NIDPID_PGSHIFT); - page->flags |= (nidpid & LAST_NIDPID_MASK) << LAST_NIDPID_PGSHIFT; + page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); + page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; } -#endif /* LAST_NIDPID_NOT_IN_PAGE_FLAGS */ -#else -static inline int page_nidpid_xchg_last(struct page *page, int nidpid) +#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ +#else /* !CONFIG_NUMA_BALANCING */ +static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { - return page_to_nid(page); + return page_to_nid(page); /* XXX */ } -static inline int page_nidpid_last(struct page *page) +static inline int page_cpupid_last(struct page *page) { - return page_to_nid(page); + return page_to_nid(page); /* XXX */ } -static inline int nidpid_to_nid(int nidpid) +static inline int cpupid_to_nid(int cpupid) { return -1; } -static inline int nidpid_to_pid(int nidpid) +static inline int cpupid_to_pid(int cpupid) { return -1; } -static inline int nid_pid_to_nidpid(int nid, int pid) +static inline int cpupid_to_cpu(int cpupid) { return -1; } -static inline bool nidpid_pid_unset(int nidpid) +static inline int cpu_pid_to_cpupid(int nid, int pid) +{ + return -1; +} + +static inline bool cpupid_pid_unset(int cpupid) { return 1; } -static inline void page_nidpid_reset_last(struct page *page) +static inline void page_cpupid_reset_last(struct page *page) { } -#endif +#endif /* CONFIG_NUMA_BALANCING */ static inline struct zone *page_zone(const struct page *page) { -- cgit v1.2.3 From 8c8a743c5087bac9caac8155b8f3b367e75cdd0b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Oct 2013 11:29:21 +0100 Subject: sched/numa: Use {cpu, pid} to create task groups for shared faults While parallel applications tend to align their data on the cache boundary, they tend not to align on the page or THP boundary. Consequently tasks that partition their data can still "false-share" pages, presenting a problem for optimal NUMA placement. This patch uses NUMA hinting faults to chain tasks together into numa_groups. As well as storing the NID a task was running on when accessing a page, a truncated representation of the faulting PID is stored. If subsequent faults are from different PIDs it is reasonable to assume that those two tasks share a page and are candidates for being grouped together. Note that this patch makes no scheduling decisions based on the grouping information.
Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1381141781-10992-44-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/mm.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index ce464cd4777e..81443d557a2e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -691,6 +691,12 @@ static inline bool cpupid_cpu_unset(int cpupid) return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } +static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) +{ + return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); +} + +#define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { @@ -760,6 +766,11 @@ static inline bool cpupid_pid_unset(int cpupid) static inline void page_cpupid_reset_last(struct page *page) { } + +static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) +{ + return false; +} #endif /* CONFIG_NUMA_BALANCING */ static inline struct zone *page_zone(const struct page *page) -- cgit v1.2.3
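
Taken together, the patches above converge on a single packed "cpupid" value: the low bits hold a truncated pid of the last faulting task and the bits above them hold the cpu it was running on, with -1 (all ones within a field) meaning "unset". The standalone sketch below mirrors the helpers introduced above (cpu_pid_to_cpupid(), cpupid_to_cpu(), cpupid_to_pid(), cpupid_pid_unset(), cpupid_match_pid()) in plain userspace C so the packing can be run and inspected outside the kernel. The 8-bit field widths are illustrative assumptions only; in the kernel they are derived from page-flags-layout definitions and the value lives either in page->flags or in a separate _last_cpupid field.

#include <stdio.h>
#include <stdbool.h>

/* Illustrative field widths (assumption): 8 bits of pid slice, 8 bits of cpu. */
#define LAST__PID_SHIFT 8
#define LAST__PID_MASK  ((1 << LAST__PID_SHIFT) - 1)
#define LAST__CPU_SHIFT 8
#define LAST__CPU_MASK  ((1 << LAST__CPU_SHIFT) - 1)

static int cpu_pid_to_cpupid(int cpu, int pid)
{
	return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK);
}

static int cpupid_to_pid(int cpupid)
{
	return cpupid & LAST__PID_MASK;
}

static int cpupid_to_cpu(int cpupid)
{
	return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK;
}

/* "unset" is -1 truncated to the field width, i.e. all ones. */
static bool cpupid_pid_unset(int cpupid)
{
	return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK);
}

/* A later fault is treated as private when the stored pid slice matches the faulting task. */
static bool cpupid_match_pid(int task_pid, int cpupid)
{
	return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid);
}

int main(void)
{
	int last = -1; /* like page_cpupid_reset_last(): both fields unset */
	printf("pid unset after reset: %d\n", cpupid_pid_unset(last));

	last = cpu_pid_to_cpupid(3, 4321); /* task with pid 4321 faults while running on cpu 3 */
	printf("cpu=%d pid-slice=%d\n", cpupid_to_cpu(last), cpupid_to_pid(last));

	/* Same task again -> private; a different pid -> shared candidate. */
	printf("same task matches: %d, other task matches: %d\n",
	       cpupid_match_pid(4321, last), cpupid_match_pid(1234, last));
	return 0;
}

Because only a pid slice is stored, unrelated tasks can collide after masking, which is why the first changelog describes the private/shared classification as approximate rather than exact.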
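
The placement policy described in the first patch -- prefer the node with the highest count of approximately private faults and ignore shared faults -- can be illustrated with a toy accounting loop. This is only a sketch of the stated policy, not the kernel's task_numa fault-statistics code; the fault-log layout, NR_NODES and pick_preferred_node() are invented for illustration.

#include <stdio.h>

#define NR_NODES 4

/* private = the cpupid pid slice stored on the page matched the faulting task */
struct fault { int node; int private; };

static int pick_preferred_node(const struct fault *log, int n)
{
	int faults[NR_NODES] = { 0 };
	int best = -1, best_faults = 0;

	for (int i = 0; i < n; i++) {
		if (!log[i].private)
			continue; /* shared faults are treated as noise for placement */
		if (++faults[log[i].node] > best_faults) {
			best_faults = faults[log[i].node];
			best = log[i].node;
		}
	}
	return best; /* -1 if no private faults were observed */
}

int main(void)
{
	/* many shared faults on node 0, a few private faults on node 2 */
	struct fault log[] = {
		{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },
		{ 2, 1 }, { 2, 1 }, { 1, 1 },
	};
	printf("preferred node: %d\n",
	       pick_preferred_node(log, sizeof(log) / sizeof(log[0])));
	return 0;
}

With this sample log the shared faults on node 0 are ignored and node 2, where the private faults occurred, wins -- matching the hypothetical workload in the changelog where a large shared array should not dominate placement.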