From 4294621f41a85497019fae64341aa5351a1921b7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:05 -0700 Subject: [PATCH] mm: rss = file_rss + anon_rss I was lazy when we added anon_rss, and chose to change as few places as possible. So currently each anonymous page has to be counted twice, in rss and in anon_rss. Which won't be so good if those are atomic counts in some configurations. Change that around: keep file_rss and anon_rss separately, and add them together (with get_mm_rss macro) when the total is needed - reading two atomics is much cheaper than updating two atomics. And update anon_rss upfront, typically in memory.c, not tucked away in page_add_anon_rmap. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 27519df0f987..afcaac66cbd5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -254,6 +254,8 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #define add_mm_counter(mm, member, value) (mm)->_##member += (value) #define inc_mm_counter(mm, member) (mm)->_##member++ #define dec_mm_counter(mm, member) (mm)->_##member-- +#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) + typedef unsigned long mm_counter_t; struct mm_struct { @@ -286,7 +288,7 @@ struct mm_struct { unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes; /* Special counters protected by the page_table_lock */ - mm_counter_t _rss; + mm_counter_t _file_rss; mm_counter_t _anon_rss; unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ -- cgit v1.2.3 From 365e9c87a982c03d0af3886e29d877f581b59611 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:18 -0700 Subject: [PATCH] mm: update_hiwaters just in time update_mem_hiwater has attracted various criticisms, in particular from those concerned with mm scalability. Originally it was called whenever rss or total_vm got raised. Then many of those callsites were replaced by a timer tick call from account_system_time. Now Frank van Maarseveen reports that to be found inadequate. How about this? Works for Frank. Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros update_hiwater_rss and update_hiwater_vm. Don't attempt to keep mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually by 1): those are hot paths. Do the opposite, update only when about to lower rss (usually by many), or just before final accounting in do_exit. Handle mm->hiwater_vm in the same way, though it's much less of an issue. Demand that whoever collects these hiwater statistics do the work of taking the maximum with rss or total_vm. And there has been no collector of these hiwater statistics in the tree. The new convention needs an example, so match Frank's usage by adding a VmPeak line above VmSize to /proc//status, and also a VmHWM line above VmRSS (High-Water-Mark or High-Water-Memory). There was a particular anomaly during mremap move, that hiwater_vm might be captured too high. A fleeting such anomaly remains, but it's quickly corrected now, whereas before it would stick. What locking? None: if the app is racy then these statistics will be racy, it's not worth any overhead to make them exact. But whenever it suits, hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under page_table_lock (for now) or with preemption disabled (later on): without going to any trouble, minimize the time between reading current values and updating, to minimize those occasions when a racing thread bumps a count up and back down in between. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index afcaac66cbd5..a9c0b7d26303 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -256,6 +256,16 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #define dec_mm_counter(mm, member) (mm)->_##member-- #define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) +#define update_hiwater_rss(mm) do { \ + unsigned long _rss = get_mm_rss(mm); \ + if ((mm)->hiwater_rss < _rss) \ + (mm)->hiwater_rss = _rss; \ +} while (0) +#define update_hiwater_vm(mm) do { \ + if ((mm)->hiwater_vm < (mm)->total_vm) \ + (mm)->hiwater_vm = (mm)->total_vm; \ +} while (0) + typedef unsigned long mm_counter_t; struct mm_struct { -- cgit v1.2.3 From f449952bc8bde7fbc73c6d20dff92b627a21f8b9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:19 -0700 Subject: [PATCH] mm: mm_struct hiwaters moved Slight and timid rearrangement of mm_struct: hiwater_rss and hiwater_vm were tacked on the end, but it seems better to keep them near _file_rss, _anon_rss and total_vm, in the same cacheline on those arches verified. There are likely to be more profitable rearrangements, but less obvious (is it good or bad that saved_auxv[AT_VECTOR_SIZE] isolates cpu_vm_mask and context from many others?), needing serious instrumentation. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index a9c0b7d26303..292cb57ce38f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -291,16 +291,19 @@ struct mm_struct { * by mmlist_lock */ - unsigned long start_code, end_code, start_data, end_data; - unsigned long start_brk, brk, start_stack; - unsigned long arg_start, arg_end, env_start, env_end; - unsigned long total_vm, locked_vm, shared_vm; - unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes; - /* Special counters protected by the page_table_lock */ mm_counter_t _file_rss; mm_counter_t _anon_rss; + unsigned long hiwater_rss; /* High-watermark of RSS usage */ + unsigned long hiwater_vm; /* High-water virtual memory usage */ + + unsigned long total_vm, locked_vm, shared_vm, exec_vm; + unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; + unsigned long start_code, end_code, start_data, end_data; + unsigned long start_brk, brk, start_stack; + unsigned long arg_start, arg_end, env_start, env_end; + unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ unsigned dumpable:2; @@ -320,11 +323,7 @@ struct mm_struct { /* aio bits */ rwlock_t ioctx_list_lock; struct kioctx *ioctx_list; - struct kioctx default_kioctx; - - unsigned long hiwater_rss; /* High-water RSS usage */ - unsigned long hiwater_vm; /* High-water virtual memory usage */ }; struct sighand_struct { -- cgit v1.2.3 From f412ac08c9861b4791af0145934c22f1458686da Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:41 -0700 Subject: [PATCH] mm: fix rss and mmlist locking A couple of oddities were guarded by page_table_lock, no longer properly guarded when that is split. The mm_counters of file_rss and anon_rss: make those an atomic_t, or an atomic64_t if the architecture supports it, in such a case. Definitions by courtesy of Christoph Lameter: who spent considerable effort on more scalable ways of counting, but found insufficient benefit in practice. And adding an mm with swap to the mmlist for swapoff: the list is well- guarded by its own lock, but the list_empty check now has to be repeated inside it. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 292cb57ce38f..1c30bc308ef1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -249,13 +249,47 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, extern void arch_unmap_area(struct mm_struct *, unsigned long); extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +/* + * The mm counters are not protected by its page_table_lock, + * so must be incremented atomically. + */ +#ifdef ATOMIC64_INIT +#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value) +#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member)) +#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member) +#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member) +#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member) +typedef atomic64_t mm_counter_t; +#else /* !ATOMIC64_INIT */ +/* + * The counters wrap back to 0 at 2^32 * PAGE_SIZE, + * that is, at 16TB if using 4kB page size. + */ +#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value) +#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member)) +#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member) +#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member) +#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member) +typedef atomic_t mm_counter_t; +#endif /* !ATOMIC64_INIT */ + +#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ +/* + * The mm counters are protected by its page_table_lock, + * so can be incremented directly. + */ #define set_mm_counter(mm, member, value) (mm)->_##member = (value) #define get_mm_counter(mm, member) ((mm)->_##member) #define add_mm_counter(mm, member, value) (mm)->_##member += (value) #define inc_mm_counter(mm, member) (mm)->_##member++ #define dec_mm_counter(mm, member) (mm)->_##member-- -#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) +typedef unsigned long mm_counter_t; + +#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ +#define get_mm_rss(mm) \ + (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) #define update_hiwater_rss(mm) do { \ unsigned long _rss = get_mm_rss(mm); \ if ((mm)->hiwater_rss < _rss) \ @@ -266,8 +300,6 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); (mm)->hiwater_vm = (mm)->total_vm; \ } while (0) -typedef unsigned long mm_counter_t; - struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; @@ -291,7 +323,9 @@ struct mm_struct { * by mmlist_lock */ - /* Special counters protected by the page_table_lock */ + /* Special counters, in some configurations protected by the + * page_table_lock, in other configurations by being atomic. + */ mm_counter_t _file_rss; mm_counter_t _anon_rss; -- cgit v1.2.3