summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched/fair.c65
-rw-r--r--kernel/sysctl.c7
2 files changed, 59 insertions, 13 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6831abb5dbef..0a349dd1fa60 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -780,10 +780,13 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_NUMA_BALANCING
/*
- * numa task sample period in ms: 5s
+ * numa task sample period in ms
*/
-unsigned int sysctl_numa_balancing_scan_period_min = 5000;
-unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+unsigned int sysctl_numa_balancing_scan_period_min = 100;
+unsigned int sysctl_numa_balancing_scan_period_max = 100*16;
+
+/* Portion of address space to scan in MB */
+unsigned int sysctl_numa_balancing_scan_size = 256;
static void task_numa_placement(struct task_struct *p)
{
@@ -808,6 +811,12 @@ void task_numa_fault(int node, int pages)
task_numa_placement(p);
}
+static void reset_ptenuma_scan(struct task_struct *p)
+{
+ ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ p->mm->numa_scan_offset = 0;
+}
+
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
@@ -817,6 +826,9 @@ void task_numa_work(struct callback_head *work)
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
+ struct vm_area_struct *vma;
+ unsigned long offset, end;
+ long length;
WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -846,18 +858,45 @@ void task_numa_work(struct callback_head *work)
if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
return;
- ACCESS_ONCE(mm->numa_scan_seq)++;
- {
- struct vm_area_struct *vma;
+ offset = mm->numa_scan_offset;
+ length = sysctl_numa_balancing_scan_size;
+ length <<= 20;
- down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma))
- continue;
- change_prot_numa(vma, vma->vm_start, vma->vm_end);
- }
- up_read(&mm->mmap_sem);
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, offset);
+ if (!vma) {
+ reset_ptenuma_scan(p);
+ offset = 0;
+ vma = mm->mmap;
+ }
+ for (; vma && length > 0; vma = vma->vm_next) {
+ if (!vma_migratable(vma))
+ continue;
+
+ /* Skip small VMAs. They are not likely to be of relevance */
+ if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR)
+ continue;
+
+ offset = max(offset, vma->vm_start);
+ end = min(ALIGN(offset + length, HPAGE_SIZE), vma->vm_end);
+ length -= end - offset;
+
+ change_prot_numa(vma, offset, end);
+
+ offset = end;
}
+
+ /*
+ * It is possible to reach the end of the VMA list but the last few VMAs are
+ * not guaranteed to the vma_migratable. If they are not, we would find the
+ * !migratable VMA on the next scan but not reset the scanner to the start
+ * so check it now.
+ */
+ if (vma)
+ mm->numa_scan_offset = offset;
+ else
+ reset_ptenuma_scan(p);
+ up_read(&mm->mmap_sem);
}
/*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 025e1ae50ef1..7d3a2e0475e5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -366,6 +366,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "numa_balancing_scan_size_mb",
+ .data = &sysctl_numa_balancing_scan_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
{