diff options
207 files changed, 2214 insertions, 1420 deletions
diff --git a/.gitignore b/.gitignore index ce57b79670a5..9ac91060ea64 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,11 @@ Module.symvers /debian/ # +# tar directory (make tar*-pkg) +# +/tar-install/ + +# # git files that we don't want to ignore even it they are dot-files # !.gitignore diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index cf8fc2f0b34b..a07ba61662ed 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -145,6 +145,8 @@ Table 1-1: Process specific entries in /proc stack Report full stack trace, enable via CONFIG_STACKTRACE smaps a extension based on maps, showing the memory consumption of each mapping and flags associated with it + numa_maps an extension based on maps, showing the memory locality and + binding policy as well as mem usage (in pages) of each mapping. .............................................................................. For example, to get the status information of a process, all you have to do is @@ -489,12 +491,47 @@ To clear the bits for the file mapped pages associated with the process To clear the soft-dirty bit > echo 4 > /proc/PID/clear_refs +To reset the peak resident set size ("high water mark") to the process's +current value: + > echo 5 > /proc/PID/clear_refs + Any other value written to /proc/PID/clear_refs will have no effect. The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags using /proc/kpageflags and number of times a page is mapped using /proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt. +The /proc/pid/numa_maps is an extension based on maps, showing the memory +locality and binding policy, as well as the memory usage (in pages) of +each mapping. The output follows a general format where mapping details get +summarized separated by blank spaces, one mapping per each file line: + +address policy mapping details + +00400000 default file=/usr/local/bin/app mapped=1 active=0 N3=1 kernelpagesize_kB=4 +00600000 default file=/usr/local/bin/app anon=1 dirty=1 N3=1 kernelpagesize_kB=4 +3206000000 default file=/lib64/ld-2.12.so mapped=26 mapmax=6 N0=24 N3=2 kernelpagesize_kB=4 +320621f000 default file=/lib64/ld-2.12.so anon=1 dirty=1 N3=1 kernelpagesize_kB=4 +3206220000 default file=/lib64/ld-2.12.so anon=1 dirty=1 N3=1 kernelpagesize_kB=4 +3206221000 default anon=1 dirty=1 N3=1 kernelpagesize_kB=4 +3206800000 default file=/lib64/libc-2.12.so mapped=59 mapmax=21 active=55 N0=41 N3=18 kernelpagesize_kB=4 +320698b000 default file=/lib64/libc-2.12.so +3206b8a000 default file=/lib64/libc-2.12.so anon=2 dirty=2 N3=2 kernelpagesize_kB=4 +3206b8e000 default file=/lib64/libc-2.12.so anon=1 dirty=1 N3=1 kernelpagesize_kB=4 +3206b8f000 default anon=3 dirty=3 active=1 N3=3 kernelpagesize_kB=4 +7f4dc10a2000 default anon=3 dirty=3 N3=3 kernelpagesize_kB=4 +7f4dc10b4000 default anon=2 dirty=2 active=1 N3=2 kernelpagesize_kB=4 +7f4dc1200000 default file=/anon_hugepage\040(deleted) huge anon=1 dirty=1 N3=1 kernelpagesize_kB=2048 +7fff335f0000 default stack anon=3 dirty=3 N3=3 kernelpagesize_kB=4 +7fff3369d000 default mapped=1 mapmax=35 active=0 N3=1 kernelpagesize_kB=4 + +Where: +"address" is the starting address for the mapping; +"policy" reports the NUMA memory policy set for the mapping (see vm/numa_memory_policy.txt); +"mapping details" summarizes mapping data such as mapping type, page usage counters, +node locality page counters (N0 == node0, N1 == node1, ...) and the kernel page +size, in KB, that is backing the mapping up. + 1.2 Kernel data --------------- diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 48bbea6898b3..d5b98ab514bb 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -27,8 +27,6 @@ struct thread_info { int bpt_nsaved; unsigned long bpt_addr[2]; /* breakpoint handling */ unsigned int bpt_insn[2]; - - struct restart_block restart_block; }; /* @@ -40,9 +38,6 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 6cec2881acbf..8dbfb15f1745 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -150,7 +150,7 @@ restore_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs) struct switch_stack *sw = (struct switch_stack *)regs - 1; long i, err = __get_user(regs->pc, &sc->sc_pc); - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; sw->r26 = (unsigned long) ret_from_sys_call; diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h index 02bc5ec0fb2e..1163a1838ac1 100644 --- a/arch/arc/include/asm/thread_info.h +++ b/arch/arc/include/asm/thread_info.h @@ -46,7 +46,6 @@ struct thread_info { struct exec_domain *exec_domain;/* execution domain */ __u32 cpu; /* current CPU */ unsigned long thr_ptr; /* TLS ptr */ - struct restart_block restart_block; }; /* @@ -62,9 +61,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/arc/kernel/signal.c b/arch/arc/kernel/signal.c index cb3142a2d40b..114234e83caa 100644 --- a/arch/arc/kernel/signal.c +++ b/arch/arc/kernel/signal.c @@ -104,7 +104,7 @@ SYSCALL_DEFINE0(rt_sigreturn) struct pt_regs *regs = current_pt_regs(); /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* Since we stacked the signal on a word boundary, * then 'sp' should be word aligned here. If it's diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 18dbc82f85e5..423a5ac09d3a 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -257,7 +257,10 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) /* represent a notpresent pmd by zero, this is used by pmdp_invalidate */ -#define pmd_mknotpresent(pmd) (__pmd(0)) +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return __pmd(0); +} static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index d890e41f5520..72812a1f3d1c 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -68,7 +68,6 @@ struct thread_info { #ifdef CONFIG_ARM_THUMBEE unsigned long thumbee_state; /* ThumbEE Handler Base register */ #endif - struct restart_block restart_block; }; #define INIT_THREAD_INFO(tsk) \ @@ -81,9 +80,6 @@ struct thread_info { .cpu_domain = domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \ domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \ domain_val(DOMAIN_IO, DOMAIN_CLIENT), \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 8aa6f1b87c9e..023ac905e4c3 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -191,7 +191,7 @@ asmlinkage int sys_sigreturn(struct pt_regs *regs) struct sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, @@ -221,7 +221,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs) struct rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 459bf8e53208..702e1e6a0d80 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -48,7 +48,6 @@ struct thread_info { mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ struct exec_domain *exec_domain; /* execution domain */ - struct restart_block restart_block; int preempt_count; /* 0 => preemptable, <0 => bug */ int cpu; /* cpu */ }; @@ -60,9 +59,6 @@ struct thread_info { .flags = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 6fa792137eda..660ccf9f7524 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -131,7 +131,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) struct rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 128-bit boundary, then 'sp' should diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index e299de396e9b..c20a300e2213 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -347,7 +347,7 @@ asmlinkage int compat_sys_sigreturn(struct pt_regs *regs) struct compat_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, @@ -381,7 +381,7 @@ asmlinkage int compat_sys_rt_sigreturn(struct pt_regs *regs) struct compat_rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h index a978f3fe7c25..d56afa99a514 100644 --- a/arch/avr32/include/asm/thread_info.h +++ b/arch/avr32/include/asm/thread_info.h @@ -30,7 +30,6 @@ struct thread_info { saved by debug handler when setting up trampoline */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -41,9 +40,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall \ - } \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/avr32/kernel/asm-offsets.c b/arch/avr32/kernel/asm-offsets.c index d6a8193a1d2f..e41c84516e5d 100644 --- a/arch/avr32/kernel/asm-offsets.c +++ b/arch/avr32/kernel/asm-offsets.c @@ -18,7 +18,6 @@ void foo(void) OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_rar_saved, thread_info, rar_saved); OFFSET(TI_rsr_saved, thread_info, rsr_saved); - OFFSET(TI_restart_block, thread_info, restart_block); BLANK(); OFFSET(TSK_active_mm, task_struct, active_mm); BLANK(); diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index d309fbcc3bd6..8f1c63b9b983 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -69,7 +69,7 @@ asmlinkage int sys_rt_sigreturn(struct pt_regs *regs) sigset_t set; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *)regs->sp; pr_debug("SIG return: frame = %p\n", frame); diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h index 55f473bdad36..57c3a8bd583d 100644 --- a/arch/blackfin/include/asm/thread_info.h +++ b/arch/blackfin/include/asm/thread_info.h @@ -42,7 +42,6 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* address limit */ - struct restart_block restart_block; #ifndef CONFIG_SMP struct l1_scratch_task_info l1_task_info; #endif @@ -58,9 +57,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) diff --git a/arch/blackfin/kernel/signal.c b/arch/blackfin/kernel/signal.c index ef275571d885..f2a8b5493bd3 100644 --- a/arch/blackfin/kernel/signal.c +++ b/arch/blackfin/kernel/signal.c @@ -44,7 +44,7 @@ rt_restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *p int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; #define RESTORE(x) err |= __get_user(regs->x, &sc->sc_##x) diff --git a/arch/c6x/include/asm/thread_info.h b/arch/c6x/include/asm/thread_info.h index d4e9ef87076d..584e253f3217 100644 --- a/arch/c6x/include/asm/thread_info.h +++ b/arch/c6x/include/asm/thread_info.h @@ -45,7 +45,6 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 = preemptable, <0 = BUG */ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; }; /* @@ -61,9 +60,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/c6x/kernel/signal.c b/arch/c6x/kernel/signal.c index fe68226f6c4d..3c4bb5a5c382 100644 --- a/arch/c6x/kernel/signal.c +++ b/arch/c6x/kernel/signal.c @@ -68,7 +68,7 @@ asmlinkage int do_rt_sigreturn(struct pt_regs *regs) sigset_t set; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a dword boundary, diff --git a/arch/cris/arch-v10/kernel/signal.c b/arch/cris/arch-v10/kernel/signal.c index 9b32d338838b..74d7ba35120d 100644 --- a/arch/cris/arch-v10/kernel/signal.c +++ b/arch/cris/arch-v10/kernel/signal.c @@ -67,7 +67,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) unsigned long old_usp; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* restore the regs from &sc->regs (same as sc, since regs is first) * (sc is already checked for VERIFY_READ since the sigframe was diff --git a/arch/cris/arch-v32/kernel/signal.c b/arch/cris/arch-v32/kernel/signal.c index 78ce3b1c9bcb..870e3e069318 100644 --- a/arch/cris/arch-v32/kernel/signal.c +++ b/arch/cris/arch-v32/kernel/signal.c @@ -59,7 +59,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) unsigned long old_usp; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Restore the registers from &sc->regs. sc is already checked diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h index 55dede18c032..7286db5ed90e 100644 --- a/arch/cris/include/asm/thread_info.h +++ b/arch/cris/include/asm/thread_info.h @@ -38,7 +38,6 @@ struct thread_info { 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -56,9 +55,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/frv/include/asm/string.h b/arch/frv/include/asm/string.h index 5ed310f64b7e..1f6c35990439 100644 --- a/arch/frv/include/asm/string.h +++ b/arch/frv/include/asm/string.h @@ -33,7 +33,6 @@ extern void *memcpy(void *, const void *, __kernel_size_t); #define __HAVE_ARCH_STRNCAT 1 #define __HAVE_ARCH_STRCMP 1 #define __HAVE_ARCH_STRNCMP 1 -#define __HAVE_ARCH_STRNICMP 1 #define __HAVE_ARCH_STRCHR 1 #define __HAVE_ARCH_STRRCHR 1 #define __HAVE_ARCH_STRSTR 1 diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h index af29e17c0181..6b917f1c2955 100644 --- a/arch/frv/include/asm/thread_info.h +++ b/arch/frv/include/asm/thread_info.h @@ -41,7 +41,6 @@ struct thread_info { * 0-0xBFFFFFFF for user-thead * 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -65,9 +64,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/frv/kernel/asm-offsets.c b/arch/frv/kernel/asm-offsets.c index 9de96843a278..446e89d500cc 100644 --- a/arch/frv/kernel/asm-offsets.c +++ b/arch/frv/kernel/asm-offsets.c @@ -40,7 +40,6 @@ void foo(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PREEMPT_COUNT, thread_info, preempt_count); OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); BLANK(); /* offsets into register file storage */ diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index dc3d59de0870..336713ab4745 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -62,7 +62,7 @@ static int restore_sigcontext(struct sigcontext __user *sc, int *_gr8) unsigned long tbr, psr; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; tbr = user->i.tbr; psr = user->i.psr; diff --git a/arch/frv/mm/extable.c b/arch/frv/mm/extable.c index 2fb9b3ab57b9..8863d6c1df6e 100644 --- a/arch/frv/mm/extable.c +++ b/arch/frv/mm/extable.c @@ -10,29 +10,6 @@ extern const void __memset_end, __memset_user_error_lr, __memset_user_error_hand extern const void __memcpy_end, __memcpy_user_error_lr, __memcpy_user_error_handler; extern spinlock_t modlist_lock; -/*****************************************************************************/ -/* - * - */ -static inline unsigned long search_one_table(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - while (first <= last) { - const struct exception_table_entry __attribute__((aligned(8))) *mid; - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid->fixup; - else if (diff < 0) - first = mid + 1; - else - last = mid - 1; - } - return 0; -} /* end search_one_table() */ /*****************************************************************************/ /* diff --git a/arch/hexagon/include/asm/thread_info.h b/arch/hexagon/include/asm/thread_info.h index a59dad3b3695..bacd3d6030c5 100644 --- a/arch/hexagon/include/asm/thread_info.h +++ b/arch/hexagon/include/asm/thread_info.h @@ -56,7 +56,6 @@ struct thread_info { * used for syscalls somehow; * seems to have a function pointer and four arguments */ - struct restart_block restart_block; /* Points to the current pt_regs frame */ struct pt_regs *regs; /* @@ -83,9 +82,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = 1, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .sp = 0, \ .regs = NULL, \ } diff --git a/arch/hexagon/kernel/signal.c b/arch/hexagon/kernel/signal.c index eadd70e47e7e..b039a624c170 100644 --- a/arch/hexagon/kernel/signal.c +++ b/arch/hexagon/kernel/signal.c @@ -239,7 +239,7 @@ asmlinkage int sys_rt_sigreturn(void) sigset_t blocked; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *)pt_psp(regs); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 5b17418b4223..c16f21a068ff 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -27,7 +27,6 @@ struct thread_info { __u32 status; /* Thread synchronous flags */ mm_segment_t addr_limit; /* user-level address space limit */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ - struct restart_block restart_block; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE __u64 ac_stamp; __u64 ac_leave; @@ -46,9 +45,6 @@ struct thread_info { .cpu = 0, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #ifndef ASM_OFFSETS_C diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 6d92170be457..b3a124da71e5 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -46,7 +46,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) long err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* restore scratch that always needs gets updated during signal delivery: */ err = __get_user(flags, &sc->sc_flags); diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h index 00171703402f..32422d0211c3 100644 --- a/arch/m32r/include/asm/thread_info.h +++ b/arch/m32r/include/asm/thread_info.h @@ -34,7 +34,6 @@ struct thread_info { 0-0xBFFFFFFF for user-thread 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -49,7 +48,6 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C #endif @@ -68,9 +66,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 95408b8f130a..7736c6660a15 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -48,7 +48,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; #define COPY(x) err |= __get_user(regs->x, &sc->sc_##x) COPY(r4); diff --git a/arch/m68k/include/asm/thread_info.h b/arch/m68k/include/asm/thread_info.h index 21a4784ca5a1..c54256e69e64 100644 --- a/arch/m68k/include/asm/thread_info.h +++ b/arch/m68k/include/asm/thread_info.h @@ -31,7 +31,6 @@ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ __u32 cpu; /* should always be 0 on m68k */ unsigned long tp_value; /* thread pointer */ - struct restart_block restart_block; }; #endif /* __ASSEMBLY__ */ @@ -41,9 +40,6 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_stack (init_thread_union.stack) diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 967a8b7e1527..d7179281e74a 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -655,7 +655,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *usc, void __u int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* get previous context */ if (copy_from_user(&context, usc, sizeof(context))) @@ -693,7 +693,7 @@ rt_restore_ucontext(struct pt_regs *regs, struct switch_stack *sw, int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err = __get_user(temp, &uc->uc_mcontext.version); if (temp != MCONTEXT_VERSION) diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h index 47711336119e..afb3ca4776d1 100644 --- a/arch/metag/include/asm/thread_info.h +++ b/arch/metag/include/asm/thread_info.h @@ -35,9 +35,8 @@ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; - u8 supervisor_stack[0]; + u8 supervisor_stack[0] __aligned(8); }; #else /* !__ASSEMBLY__ */ @@ -74,9 +73,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/metag/kernel/signal.c b/arch/metag/kernel/signal.c index 0d100d5c1407..ce49d429c74a 100644 --- a/arch/metag/kernel/signal.c +++ b/arch/metag/kernel/signal.c @@ -48,7 +48,7 @@ static int restore_sigcontext(struct pt_regs *regs, int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err = metag_gp_regs_copyin(regs, 0, sizeof(struct user_gp_regs), NULL, &sc->regs); diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index 8c9d36591a03..b699fbd7de4a 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -71,7 +71,6 @@ struct thread_info { __u32 cpu; /* current CPU */ __s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; struct cpu_context cpu_context; }; @@ -87,9 +86,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 235706055b7f..a1cbaf90e2ea 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -89,7 +89,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) int rval; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index e4440f92b366..9e1295f874f0 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -34,7 +34,6 @@ struct thread_info { * 0x7fffffff for user-thead * 0xffffffff for kernel-thread */ - struct restart_block restart_block; struct pt_regs *regs; long syscall; /* syscall number */ }; @@ -50,9 +49,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index b1d84bd4efb3..3b2dfdb4865f 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -98,7 +98,6 @@ void output_thread_info_defines(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PRE_COUNT, thread_info, preempt_count); OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); OFFSET(TI_REGS, thread_info, regs); DEFINE(_THREAD_SIZE, THREAD_SIZE); DEFINE(_THREAD_MASK, THREAD_MASK); diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 545bf11bd2ed..6a28c792d862 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -243,7 +243,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) int i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err |= __get_user(regs->cp0_epc, &sc->sc_pc); diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index d69179c0d49d..19a7705f2a01 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -220,7 +220,7 @@ static int restore_sigcontext32(struct pt_regs *regs, int i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err |= __get_user(regs->cp0_epc, &sc->sc_pc); err |= __get_user(regs->hi, &sc->sc_mdhi); diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index bf280eaccd36..c1c374f0ec12 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -50,7 +50,6 @@ struct thread_info { 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; }; @@ -80,9 +79,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/mn10300/kernel/asm-offsets.c b/arch/mn10300/kernel/asm-offsets.c index 47b3bb0c04ff..d780670cbaf3 100644 --- a/arch/mn10300/kernel/asm-offsets.c +++ b/arch/mn10300/kernel/asm-offsets.c @@ -28,7 +28,6 @@ void foo(void) OFFSET(TI_cpu, thread_info, cpu); OFFSET(TI_preempt_count, thread_info, preempt_count); OFFSET(TI_addr_limit, thread_info, addr_limit); - OFFSET(TI_restart_block, thread_info, restart_block); BLANK(); OFFSET(REG_D0, pt_regs, d0); diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index a6c0858592c3..8609845f12c5 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -40,7 +40,7 @@ static int restore_sigcontext(struct pt_regs *regs, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (is_using_fpu(current)) fpu_kill_state(current); diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h index d797acc901e4..875f0845a707 100644 --- a/arch/openrisc/include/asm/thread_info.h +++ b/arch/openrisc/include/asm/thread_info.h @@ -57,7 +57,6 @@ struct thread_info { 0-0x7FFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; __u8 supervisor_stack[0]; /* saved context data */ @@ -79,9 +78,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = 1, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .ksp = 0, \ } diff --git a/arch/openrisc/kernel/signal.c b/arch/openrisc/kernel/signal.c index 7d1b8235bf90..4112175bf803 100644 --- a/arch/openrisc/kernel/signal.c +++ b/arch/openrisc/kernel/signal.c @@ -46,7 +46,7 @@ static int restore_sigcontext(struct pt_regs *regs, int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Restore the regs from &sc->regs. diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index a84611835549..fb13e3865563 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -14,7 +14,6 @@ struct thread_info { mm_segment_t addr_limit; /* user-level address space limit */ __u32 cpu; /* current CPU */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ - struct restart_block restart_block; }; #define INIT_THREAD_INFO(tsk) \ @@ -25,9 +24,6 @@ struct thread_info { .cpu = 0, \ .addr_limit = KERNEL_DS, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall \ - } \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 012d4fa63d97..9b910a0251b8 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -99,7 +99,7 @@ sys_rt_sigreturn(struct pt_regs *regs, int in_syscall) sigframe_size = PARISC_RT_SIGFRAME_SIZE32; #endif - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* Unwind the user stack to get the rt_sigframe structure. */ frame = (struct rt_sigframe __user *) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 7e77f2ca5132..79fee2eb8d56 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -40,63 +40,27 @@ static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } #ifdef CONFIG_NUMA_BALANCING -static inline int pte_present(pte_t pte) -{ - return pte_val(pte) & _PAGE_NUMA_MASK; -} - -#define pte_present_nonuma pte_present_nonuma -static inline int pte_present_nonuma(pte_t pte) -{ - return pte_val(pte) & (_PAGE_PRESENT); -} - -#define ptep_set_numa ptep_set_numa -static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - if ((pte_val(*ptep) & _PAGE_PRESENT) == 0) - VM_BUG_ON(1); - - pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0); - return; -} - -#define pmdp_set_numa pmdp_set_numa -static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0) - VM_BUG_ON(1); - - pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA); - return; -} - /* - * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist - * which was inherited from x86. For the purposes of powerpc pte_basic_t and - * pmd_t are equivalent + * These work without NUMA balancing but the kernel does not care. See the + * comment in include/asm-generic/pgtable.h . On powerpc, this will only + * work for user pages and always return true for kernel pages. */ -#define pteval_t pte_basic_t -#define pmdval_t pmd_t -static inline pteval_t ptenuma_flags(pte_t pte) +static inline int pte_protnone(pte_t pte) { - return pte_val(pte) & _PAGE_NUMA_MASK; + return (pte_val(pte) & + (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT; } -static inline pmdval_t pmdnuma_flags(pmd_t pmd) +static inline int pmd_protnone(pmd_t pmd) { - return pmd_val(pmd) & _PAGE_NUMA_MASK; + return pte_protnone(pmd_pte(pmd)); } - -# else +#endif /* CONFIG_NUMA_BALANCING */ static inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_PRESENT; } -#endif /* CONFIG_NUMA_BALANCING */ /* Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 2aef9b7a0eb2..c5a755ef7011 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -104,11 +104,6 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); _PAGE_USER | _PAGE_ACCESSED | _PAGE_RO | \ _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) -#ifdef CONFIG_NUMA_BALANCING -/* Mask of bits that distinguish present and numa ptes */ -#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PRESENT) -#endif - /* * We define 2 sets of base prot bits, one for basic pages (ie, * cacheable kernel and user pages) and one for non cacheable diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h index 2505d8eab15c..55aea0caf95e 100644 --- a/arch/powerpc/include/asm/pte-hash64.h +++ b/arch/powerpc/include/asm/pte-hash64.h @@ -27,12 +27,6 @@ #define _PAGE_RW 0x0200 /* software: user write access allowed */ #define _PAGE_BUSY 0x0800 /* software: PTE & hash are busy */ -/* - * Used for tracking numa faults - */ -#define _PAGE_NUMA 0x00000010 /* Gather numa placement stats */ - - /* No separate kernel read-only */ #define _PAGE_KERNEL_RW (_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */ #define _PAGE_KERNEL_RO _PAGE_KERNEL_RW diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index e8abc83e699f..72489799cf02 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -43,7 +43,6 @@ struct thread_info { int cpu; /* cpu we're on */ int preempt_count; /* 0 => preemptable, <0 => BUG */ - struct restart_block restart_block; unsigned long local_flags; /* private flags for thread */ /* low level flags - has atomic operations done on it */ @@ -59,9 +58,6 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .flags = 0, \ } diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index b171001698ff..d3a831ac0f92 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -1231,7 +1231,7 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, int tm_restore = 0; #endif /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; rt_sf = (struct rt_sigframe __user *) (regs->gpr[1] + __SIGNAL_FRAMESIZE + 16); @@ -1504,7 +1504,7 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, #endif /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); sc = &sf->sctx; diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 2cb0c94cafa5..c7c24d2e2bdb 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -666,7 +666,7 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5, #endif /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, uc, sizeof(*uc))) goto badframe; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index fa7c4f12104f..7316dd15278a 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -621,6 +621,38 @@ unsigned long long sched_clock(void) return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift; } + +#ifdef CONFIG_PPC_PSERIES + +/* + * Running clock - attempts to give a view of time passing for a virtualised + * kernels. + * Uses the VTB register if available otherwise a next best guess. + */ +unsigned long long running_clock(void) +{ + /* + * Don't read the VTB as a host since KVM does not switch in host + * timebase into the VTB when it takes a guest off the CPU, reading the + * VTB would result in reading 'last switched out' guest VTB. + * + * Host kernels are often compiled with CONFIG_PPC_PSERIES checked, it + * would be unsafe to rely only on the #ifdef above. + */ + if (firmware_has_feature(FW_FEATURE_LPAR) && + cpu_has_feature(CPU_FTR_ARCH_207S)) + return mulhdu(get_vtb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift; + + /* + * This is a next best approximation without a VTB. + * On a host which is running bare metal there should never be any stolen + * time and on a host which doesn't do any virtualisation TB *should* equal + * VTB so it makes no difference anyway. + */ + return local_clock() - cputime_to_nsecs(kcpustat_this_cpu->cpustat[CPUTIME_STEAL]); +} +#endif + static int __init get_freq(char *name, int cells, unsigned long *val) { struct device_node *cpu; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 510bdfbc4073..625407e4d3b0 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -212,7 +212,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Look up the Linux PTE for the backing page */ pte_size = psize; pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size); - if (pte_present(pte) && !pte_numa(pte)) { + if (pte_present(pte) && !pte_protnone(pte)) { if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ ptel = hpte_make_readonly(ptel); diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 1b5305d4bdab..f031a47d7701 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -64,10 +64,14 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, if (!(vma->vm_flags & VM_WRITE)) goto out_unlock; } else { - if (dsisr & DSISR_PROTFAULT) - goto out_unlock; if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto out_unlock; + /* + * protfault should only happen due to us + * mapping a region readonly temporarily. PROT_NONE + * is also covered by the VMA check above. + */ + WARN_ON_ONCE(dsisr & DSISR_PROTFAULT); } ret = 0; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 6154b0a2b063..b396868d2aa7 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -389,19 +389,6 @@ good_area: #endif /* CONFIG_8xx */ if (is_exec) { -#ifdef CONFIG_PPC_STD_MMU - /* Protection fault on exec go straight to failure on - * Hash based MMUs as they either don't support per-page - * execute permission, or if they do, it's handled already - * at the hash level. This test would probably have to - * be removed if we change the way this works to make hash - * processors use the same I/D cache coherency mechanism - * as embedded. - */ - if (error_code & DSISR_PROTFAULT) - goto bad_area; -#endif /* CONFIG_PPC_STD_MMU */ - /* * Allow execution from readable areas if the MMU does not * provide separate controls over reading and executing. @@ -416,6 +403,14 @@ good_area: (cpu_has_feature(CPU_FTR_NOEXECUTE) || !(vma->vm_flags & (VM_READ | VM_WRITE)))) goto bad_area; +#ifdef CONFIG_PPC_STD_MMU + /* + * protfault should only happen due to us + * mapping a region readonly temporarily. PROT_NONE + * is also covered by the VMA check above. + */ + WARN_ON_ONCE(error_code & DSISR_PROTFAULT); +#endif /* CONFIG_PPC_STD_MMU */ /* a write */ } else if (is_write) { if (!(vma->vm_flags & VM_WRITE)) @@ -423,11 +418,9 @@ good_area: flags |= FAULT_FLAG_WRITE; /* a read */ } else { - /* protection fault */ - if (error_code & 0x08000000) - goto bad_area; if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; + WARN_ON_ONCE(error_code & DSISR_PROTFAULT); } /* diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index c90e602677c9..83dfcb55ffef 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -172,9 +172,14 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { -#ifdef CONFIG_DEBUG_VM - WARN_ON(pte_val(*ptep) & _PAGE_PRESENT); -#endif + /* + * When handling numa faults, we already have the pte marked + * _PAGE_PRESENT, but we can be sure that it is not in hpte. + * Hence we can use set_pte_at for them. + */ + VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER)); + /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this * is called. diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 4fe5f64cc179..91bb8836825a 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -718,7 +718,8 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { #ifdef CONFIG_DEBUG_VM - WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT); + WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER)); assert_spin_locked(&mm->page_table_lock); WARN_ON(!pmd_trans_huge(pmd)); #endif diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 7e2dcd7c57ef..8662f5c8e17f 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -44,7 +44,6 @@ extern char *strstr(const char *, const char *); #undef __HAVE_ARCH_STRCHR #undef __HAVE_ARCH_STRNCHR #undef __HAVE_ARCH_STRNCMP -#undef __HAVE_ARCH_STRNICMP #undef __HAVE_ARCH_STRPBRK #undef __HAVE_ARCH_STRSEP #undef __HAVE_ARCH_STRSPN diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 4d62fd5b56e5..ef1df718642d 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -39,7 +39,6 @@ struct thread_info { unsigned long sys_call_table; /* System call table address */ unsigned int cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, <0 => BUG */ - struct restart_block restart_block; unsigned int system_call; __u64 user_timer; __u64 system_timer; @@ -56,9 +55,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 34d5fa7b01b5..bc1df12dd4f8 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -209,7 +209,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) int i; /* Alwys make any pending restarted system call return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) return -EFAULT; diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 6a2ac257d98f..b3ae6f70c6d6 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -162,7 +162,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) _sigregs user_sregs; /* Alwys make any pending restarted system call return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs))) return -EFAULT; diff --git a/arch/score/include/asm/thread_info.h b/arch/score/include/asm/thread_info.h index 656b7ada9326..33864fa2a8d4 100644 --- a/arch/score/include/asm/thread_info.h +++ b/arch/score/include/asm/thread_info.h @@ -42,7 +42,6 @@ struct thread_info { * 0-0xFFFFFFFF for kernel-thread */ mm_segment_t addr_limit; - struct restart_block restart_block; struct pt_regs *regs; }; @@ -58,9 +57,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = 1, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/score/kernel/asm-offsets.c b/arch/score/kernel/asm-offsets.c index 57788f44c6fb..b4d5214a7a7e 100644 --- a/arch/score/kernel/asm-offsets.c +++ b/arch/score/kernel/asm-offsets.c @@ -106,7 +106,6 @@ void output_thread_info_defines(void) OFFSET(TI_CPU, thread_info, cpu); OFFSET(TI_PRE_COUNT, thread_info, preempt_count); OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); OFFSET(TI_REGS, thread_info, regs); DEFINE(KERNEL_STACK_SIZE, THREAD_SIZE); DEFINE(KERNEL_STACK_MASK, THREAD_MASK); diff --git a/arch/score/kernel/signal.c b/arch/score/kernel/signal.c index 1651807774ad..e381c8c4ff65 100644 --- a/arch/score/kernel/signal.c +++ b/arch/score/kernel/signal.c @@ -141,7 +141,7 @@ score_rt_sigreturn(struct pt_regs *regs) int sig; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; frame = (struct rt_sigframe __user *) regs->regs[0]; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index ad27ffa65e2e..657c03919627 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -33,7 +33,6 @@ struct thread_info { __u32 cpu; int preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; unsigned long previous_sp; /* sp of previous stack in case of nested IRQ stacks */ __u8 supervisor_stack[0]; @@ -63,9 +62,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/sh/kernel/asm-offsets.c b/arch/sh/kernel/asm-offsets.c index 08a2be775b6c..542225fedb11 100644 --- a/arch/sh/kernel/asm-offsets.c +++ b/arch/sh/kernel/asm-offsets.c @@ -25,7 +25,6 @@ int main(void) DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); - DEFINE(TI_RESTART_BLOCK,offsetof(struct thread_info, restart_block)); DEFINE(TI_SIZE, sizeof(struct thread_info)); #ifdef CONFIG_HIBERNATION diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index 2f002b24fb92..0b34f2a704fe 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -156,7 +156,7 @@ asmlinkage int sys_sigreturn(void) int r0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -186,7 +186,7 @@ asmlinkage int sys_rt_sigreturn(void) int r0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index 897abe7b871e..71993c6a7d94 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -260,7 +260,7 @@ asmlinkage int sys_sigreturn(unsigned long r2, unsigned long r3, long long ret; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -294,7 +294,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long r2, unsigned long r3, long long ret; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index 025c98446b1e..fd7bd0a440ca 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -47,8 +47,6 @@ struct thread_info { struct reg_window32 reg_window[NSWINS]; /* align for ldd! */ unsigned long rwbuf_stkptrs[NSWINS]; unsigned long w_saved; - - struct restart_block restart_block; }; /* @@ -62,9 +60,6 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) @@ -103,7 +98,6 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define TI_REG_WINDOW 0x30 #define TI_RWIN_SPTRS 0x230 #define TI_W_SAVED 0x250 -/* #define TI_RESTART_BLOCK 0x25n */ /* Nobody cares */ /* * thread information flag bit numbers diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 798f0279a4b5..ff455164732a 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -58,8 +58,6 @@ struct thread_info { unsigned long gsr[7]; unsigned long xfsr[7]; - struct restart_block restart_block; - struct pt_regs *kern_una_regs; unsigned int kern_una_insn; @@ -92,10 +90,9 @@ struct thread_info { #define TI_RWIN_SPTRS 0x000003c8 #define TI_GSR 0x00000400 #define TI_XFSR 0x00000438 -#define TI_RESTART_BLOCK 0x00000470 -#define TI_KUNA_REGS 0x000004a0 -#define TI_KUNA_INSN 0x000004a8 -#define TI_FPREGS 0x000004c0 +#define TI_KUNA_REGS 0x00000470 +#define TI_KUNA_INSN 0x00000478 +#define TI_FPREGS 0x00000480 /* We embed this in the uppermost byte of thread_info->flags */ #define FAULT_CODE_WRITE 0x01 /* Write access, implies D-TLB */ @@ -124,9 +121,6 @@ struct thread_info { .current_ds = ASI_P, \ .exec_domain = &default_exec_domain, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 62deba7be1a9..4eed773a7735 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -150,7 +150,7 @@ void do_sigreturn32(struct pt_regs *regs) int err, i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack(); @@ -235,7 +235,7 @@ asmlinkage void do_rt_sigreturn32(struct pt_regs *regs) int err, i; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack(); regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL; diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 9ee72fc8e0e4..52aa5e4ce5e7 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -70,7 +70,7 @@ asmlinkage void do_sigreturn(struct pt_regs *regs) int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack(); diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 1a6999868031..d88beff47bab 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -254,7 +254,7 @@ void do_rt_sigreturn(struct pt_regs *regs) int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; synchronize_user_stack (); sf = (struct rt_signal_frame __user *) diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 981a769b9558..a27651e866e7 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2730,8 +2730,6 @@ void __init trap_init(void) TI_NEW_CHILD != offsetof(struct thread_info, new_child) || TI_CURRENT_DS != offsetof(struct thread_info, current_ds) || - TI_RESTART_BLOCK != offsetof(struct thread_info, - restart_block) || TI_KUNA_REGS != offsetof(struct thread_info, kern_una_regs) || TI_KUNA_INSN != offsetof(struct thread_info, diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index 48e4fd0f38e4..96c14c1430d8 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -36,7 +36,6 @@ struct thread_info { mm_segment_t addr_limit; /* thread address space (KERNEL_DS or USER_DS) */ - struct restart_block restart_block; struct single_step_state *step_state; /* single step state (if non-zero) */ int align_ctl; /* controls unaligned access */ @@ -57,9 +56,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .step_state = NULL, \ .align_ctl = 0, \ } diff --git a/arch/tile/kernel/signal.c b/arch/tile/kernel/signal.c index bb0a9ce7ae23..8a524e332c1a 100644 --- a/arch/tile/kernel/signal.c +++ b/arch/tile/kernel/signal.c @@ -48,7 +48,7 @@ int restore_sigcontext(struct pt_regs *regs, int err; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Enforce that sigcontext is like pt_regs, and doesn't mess diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index 1c5b2a83046a..e04114c4fcd9 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -22,7 +22,6 @@ struct thread_info { mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user 0-0xFFFFFFFF for kernel */ - struct restart_block restart_block; struct thread_info *real_thread; /* Points to non-IRQ stack */ }; @@ -34,9 +33,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ .real_thread = NULL, \ } diff --git a/arch/unicore32/include/asm/thread_info.h b/arch/unicore32/include/asm/thread_info.h index af36d8eabdf1..63e2839dfeb8 100644 --- a/arch/unicore32/include/asm/thread_info.h +++ b/arch/unicore32/include/asm/thread_info.h @@ -79,7 +79,6 @@ struct thread_info { #ifdef CONFIG_UNICORE_FPU_F64 struct fp_state fpstate __attribute__((aligned(8))); #endif - struct restart_block restart_block; }; #define INIT_THREAD_INFO(tsk) \ @@ -89,9 +88,6 @@ struct thread_info { .flags = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c index 7c8fb7018dc6..d329f85766cc 100644 --- a/arch/unicore32/kernel/signal.c +++ b/arch/unicore32/kernel/signal.c @@ -105,7 +105,7 @@ asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs) struct rt_sigframe __user *frame; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; /* * Since we stacked the signal on a 64-bit boundary, diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f9e181aaba97..d0165c9a2932 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -169,7 +169,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, u32 tmp; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; get_user_try { /* diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0fe03f834fb1..67fc3d2b0aab 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -132,13 +132,7 @@ static inline int pte_exec(pte_t pte) static inline int pte_special(pte_t pte) { - /* - * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h. - * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 == - * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL. - */ - return (pte_flags(pte) & _PAGE_SPECIAL) && - (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE)); + return pte_flags(pte) & _PAGE_SPECIAL; } static inline unsigned long pte_pfn(pte_t pte) @@ -300,7 +294,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) static inline pmd_t pmd_mknotpresent(pmd_t pmd) { - return pmd_clear_flags(pmd, _PAGE_PRESENT); + return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); } #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY @@ -443,13 +437,6 @@ static inline int pte_same(pte_t a, pte_t b) static inline int pte_present(pte_t a) { - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | - _PAGE_NUMA); -} - -#define pte_present_nonuma pte_present_nonuma -static inline int pte_present_nonuma(pte_t a) -{ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } @@ -459,7 +446,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) if (pte_flags(a) & _PAGE_PRESENT) return true; - if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && + if ((pte_flags(a) & _PAGE_PROTNONE) && mm_tlb_flush_pending(mm)) return true; @@ -479,10 +466,25 @@ static inline int pmd_present(pmd_t pmd) * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). */ - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | - _PAGE_NUMA); + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } +#ifdef CONFIG_NUMA_BALANCING +/* + * These work without NUMA balancing but the kernel does not care. See the + * comment in include/asm-generic/pgtable.h + */ +static inline int pte_protnone(pte_t pte) +{ + return pte_flags(pte) & _PAGE_PROTNONE; +} + +static inline int pmd_protnone(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_PROTNONE; +} +#endif /* CONFIG_NUMA_BALANCING */ + static inline int pmd_none(pmd_t pmd) { /* Only check low word on 32-bit platforms, since it might be @@ -539,11 +541,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) static inline int pmd_bad(pmd_t pmd) { -#ifdef CONFIG_NUMA_BALANCING - /* pmd_numa check */ - if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) - return 0; -#endif return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; } @@ -862,19 +859,16 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); } static inline int pte_swp_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present_nonuma(pte)); return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } #endif diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e227970f983e..2ee781114d34 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -142,12 +142,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* Encode and de-code a swap entry */ #define SWP_TYPE_BITS 5 -#ifdef CONFIG_NUMA_BALANCING -/* Automatic NUMA balancing needs to be distinguishable from swap entries */ -#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) -#else #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) -#endif #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 3e0230c94cff..8c7c10802e9c 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -27,14 +27,6 @@ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ -/* - * Swap offsets on configurations that allow automatic NUMA balancing use the - * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from - * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the - * maximum possible swap space from 16TB to 8TB. - */ -#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) - /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL @@ -76,21 +68,6 @@ #endif /* - * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page - * that is not present. The hinting fault gathers numa placement statistics - * (see pte_numa()). The bit is always zero when the PTE is not present. - * - * The bit picked must be always zero when the pmd is present and not - * present, so that we don't lose information when we set it while - * atomically clearing the present bit. - */ -#ifdef CONFIG_NUMA_BALANCING -#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) -#else -#define _PAGE_NUMA (_AT(pteval_t, 0)) -#endif - -/* * Tracking soft dirty bit when a page goes to a swap is tricky. * We need a bit which can be stored in pte _and_ not conflict * with swap entry format. On x86 bits 6 and 7 are *not* involved @@ -122,8 +99,8 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_NUMA) -#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) + _PAGE_SOFT_DIRTY) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* * The cache modes defined here are used to translate between pure SW usage @@ -324,20 +301,6 @@ static inline pteval_t pte_flags(pte_t pte) return native_pte_val(pte) & PTE_FLAGS_MASK; } -#ifdef CONFIG_NUMA_BALANCING -/* Set of bits that distinguishes present, prot_none and numa ptes */ -#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) -static inline pteval_t ptenuma_flags(pte_t pte) -{ - return pte_flags(pte) & _PAGE_NUMA_MASK; -} - -static inline pmdval_t pmdnuma_flags(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_NUMA_MASK; -} -#endif /* CONFIG_NUMA_BALANCING */ - #define pgprot_val(x) ((x).pgprot) #define __pgprot(x) ((pgprot_t) { (x) } ) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e82e95abc92b..1d4e4f279a32 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -31,7 +31,6 @@ struct thread_info { __u32 cpu; /* current CPU */ int saved_preempt_count; mm_segment_t addr_limit; - struct restart_block restart_block; void __user *sysenter_return; unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ @@ -45,9 +44,6 @@ struct thread_info { .cpu = 0, \ .saved_preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 319bcb9372fe..3acbff4716b0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -168,7 +168,7 @@ static void _hpet_print_config(const char *function, int line) #define hpet_print_config() \ do { \ if (hpet_verbose) \ - _hpet_print_config(__FUNCTION__, __LINE__); \ + _hpet_print_config(__func__, __LINE__); \ } while (0) /* diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index fe3dbfe0c4a5..cd9685235df9 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -49,11 +49,11 @@ int mach_set_rtc_mmss(const struct timespec *now) retval = set_rtc_time(&tm); if (retval) printk(KERN_ERR "%s: RTC write failed with error %d\n", - __FUNCTION__, retval); + __func__, retval); } else { printk(KERN_ERR "%s: Invalid RTC value: write of %lx to RTC failed\n", - __FUNCTION__, nowtime); + __func__, nowtime); retval = -EINVAL; } return retval; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 2a33c8f68319..e5042463c1bc 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -69,7 +69,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; get_user_try { diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 89df70e0caa6..81bf3d2af3eb 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -84,7 +84,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, struct page *page; /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_numa(pte)) { + if (pte_protnone(pte)) { pte_unmap(ptep); return 0; } @@ -178,7 +178,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, * slowpath for accounting purposes and so that they * can be serialised against THP migration. */ - if (pmd_numa(pmd)) + if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) return 0; diff --git a/arch/x86/platform/intel-mid/intel_mid_vrtc.c b/arch/x86/platform/intel-mid/intel_mid_vrtc.c index 4762cff7facd..32947ba0f62d 100644 --- a/arch/x86/platform/intel-mid/intel_mid_vrtc.c +++ b/arch/x86/platform/intel-mid/intel_mid_vrtc.c @@ -110,7 +110,7 @@ int vrtc_set_mmss(const struct timespec *now) spin_unlock_irqrestore(&rtc_lock, flags); } else { pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n", - __FUNCTION__, now->tv_sec); + __func__, now->tv_sec); retval = -EINVAL; } return retval; diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index 79d824551c1a..0c8c32bfd792 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -157,7 +157,7 @@ static int copy_sc_from_user(struct pt_regs *regs, int err, pid; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; err = copy_from_user(&sc, from, sizeof(sc)); if (err) diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index 470153e8547c..a9b5d3ba196c 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -51,7 +51,6 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/ mm_segment_t addr_limit; /* thread address space */ - struct restart_block restart_block; unsigned long cpenable; @@ -72,7 +71,6 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C #endif @@ -90,9 +88,6 @@ struct thread_info { .cpu = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ } #define init_thread_info (init_thread_union.thread_info) diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index 4612321c73cc..3d733ba16f28 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -245,7 +245,7 @@ asmlinkage long xtensa_rt_sigreturn(long a0, long a1, long a2, long a3, int ret; /* Always make any pending restarted system calls return -EINTR */ - current_thread_info()->restart_block.fn = do_no_restart_syscall; + current->restart_block.fn = do_no_restart_syscall; if (regs->depc > 64) panic("rt_sigreturn in double exception!\n"); diff --git a/drivers/acpi/acpica/utdebug.c b/drivers/acpi/acpica/utdebug.c index 57078e3ea9b7..4f3f888d33bb 100644 --- a/drivers/acpi/acpica/utdebug.c +++ b/drivers/acpi/acpica/utdebug.c @@ -111,8 +111,8 @@ void acpi_ut_track_stack_ptr(void) * RETURN: Updated pointer to the function name * * DESCRIPTION: Remove the "Acpi" prefix from the function name, if present. - * This allows compiler macros such as __FUNCTION__ to be used - * with no change to the debug output. + * This allows compiler macros such as __func__ to be used with no + * change to the debug output. * ******************************************************************************/ diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 7f66d2e08f19..37779e4c4585 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1391,7 +1391,7 @@ static int blkfront_probe(struct xenbus_device *dev, if (major != XENVBD_MAJOR) { printk(KERN_INFO "%s: HVM does not support vbd %d as xen block device\n", - __FUNCTION__, vdevice); + __func__, vdevice); return -ENODEV; } } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index bd8bda386e02..8e233edd7a09 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -53,9 +53,9 @@ static ssize_t name##_show(struct device *d, \ } \ static DEVICE_ATTR_RO(name); -static inline int init_done(struct zram *zram) +static inline bool init_done(struct zram *zram) { - return zram->meta != NULL; + return zram->disksize; } static inline struct zram *dev_to_zram(struct device *dev) @@ -307,42 +307,67 @@ static inline int valid_io_request(struct zram *zram, return 1; } -static void zram_meta_free(struct zram_meta *meta) +static void zram_meta_free(struct zram_meta *meta, u64 disksize) { + size_t num_pages = disksize >> PAGE_SHIFT; + size_t index; + + /* Free all pages that are still in this zram device */ + for (index = 0; index < num_pages; index++) { + unsigned long handle = meta->table[index].handle; + + if (!handle) + continue; + + zs_free(meta->mem_pool, handle); + } + zs_destroy_pool(meta->mem_pool); vfree(meta->table); kfree(meta); } -static struct zram_meta *zram_meta_alloc(u64 disksize) +static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize) { size_t num_pages; + char pool_name[8]; struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); + if (!meta) - goto out; + return NULL; num_pages = disksize >> PAGE_SHIFT; meta->table = vzalloc(num_pages * sizeof(*meta->table)); if (!meta->table) { pr_err("Error allocating zram address table\n"); - goto free_meta; + goto out_error; } - meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM); + snprintf(pool_name, sizeof(pool_name), "zram%d", device_id); + meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); if (!meta->mem_pool) { pr_err("Error creating memory pool\n"); - goto free_table; + goto out_error; } return meta; -free_table: +out_error: vfree(meta->table); -free_meta: kfree(meta); - meta = NULL; -out: - return meta; + return NULL; +} + +static inline bool zram_meta_get(struct zram *zram) +{ + if (atomic_inc_not_zero(&zram->refcount)) + return true; + return false; +} + +static inline void zram_meta_put(struct zram *zram) +{ + atomic_dec(&zram->refcount); } static void update_position(u32 *index, int *offset, struct bio_vec *bvec) @@ -704,10 +729,11 @@ static void zram_bio_discard(struct zram *zram, u32 index, } } -static void zram_reset_device(struct zram *zram, bool reset_capacity) +static void zram_reset_device(struct zram *zram) { - size_t index; struct zram_meta *meta; + struct zcomp *comp; + u64 disksize; down_write(&zram->init_lock); @@ -719,36 +745,30 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) } meta = zram->meta; - /* Free all pages that are still in this zram device */ - for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) { - unsigned long handle = meta->table[index].handle; - if (!handle) - continue; - - zs_free(meta->mem_pool, handle); - } - - zcomp_destroy(zram->comp); - zram->max_comp_streams = 1; + comp = zram->comp; + disksize = zram->disksize; + /* + * Refcount will go down to 0 eventually and r/w handler + * cannot handle further I/O so it will bail out by + * check zram_meta_get. + */ + zram_meta_put(zram); + /* + * We want to free zram_meta in process context to avoid + * deadlock between reclaim path and any other locks. + */ + wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); - zram_meta_free(zram->meta); - zram->meta = NULL; /* Reset stats */ memset(&zram->stats, 0, sizeof(zram->stats)); - zram->disksize = 0; - if (reset_capacity) - set_capacity(zram->disk, 0); + zram->max_comp_streams = 1; + set_capacity(zram->disk, 0); up_write(&zram->init_lock); - - /* - * Revalidate disk out of the init_lock to avoid lockdep splat. - * It's okay because disk's capacity is protected by init_lock - * so that revalidate_disk always sees up-to-date capacity. - */ - if (reset_capacity) - revalidate_disk(zram->disk); + /* I/O operation under all of CPU are done so let's free */ + zram_meta_free(meta, disksize); + zcomp_destroy(comp); } static ssize_t disksize_store(struct device *dev, @@ -765,7 +785,7 @@ static ssize_t disksize_store(struct device *dev, return -EINVAL; disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(disksize); + meta = zram_meta_alloc(zram->disk->first_minor, disksize); if (!meta) return -ENOMEM; @@ -784,6 +804,8 @@ static ssize_t disksize_store(struct device *dev, goto out_destroy_comp; } + init_waitqueue_head(&zram->io_done); + atomic_set(&zram->refcount, 1); zram->meta = meta; zram->comp = comp; zram->disksize = disksize; @@ -803,7 +825,7 @@ out_destroy_comp: up_write(&zram->init_lock); zcomp_destroy(comp); out_free_meta: - zram_meta_free(meta); + zram_meta_free(meta, disksize); return err; } @@ -821,8 +843,9 @@ static ssize_t reset_store(struct device *dev, if (!bdev) return -ENOMEM; + mutex_lock(&bdev->bd_mutex); /* Do not reset an active device! */ - if (bdev->bd_holders) { + if (bdev->bd_openers) { ret = -EBUSY; goto out; } @@ -838,12 +861,16 @@ static ssize_t reset_store(struct device *dev, /* Make sure all pending I/O is finished */ fsync_bdev(bdev); + zram_reset_device(zram); + + mutex_unlock(&bdev->bd_mutex); + revalidate_disk(zram->disk); bdput(bdev); - zram_reset_device(zram, true); return len; out: + mutex_unlock(&bdev->bd_mutex); bdput(bdev); return ret; } @@ -909,23 +936,21 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) { struct zram *zram = queue->queuedata; - down_read(&zram->init_lock); - if (unlikely(!init_done(zram))) + if (unlikely(!zram_meta_get(zram))) goto error; if (!valid_io_request(zram, bio->bi_iter.bi_sector, bio->bi_iter.bi_size)) { atomic64_inc(&zram->stats.invalid_io); - goto error; + goto put_zram; } __zram_make_request(zram, bio); - up_read(&zram->init_lock); - + zram_meta_put(zram); return; - +put_zram: + zram_meta_put(zram); error: - up_read(&zram->init_lock); bio_io_error(bio); } @@ -947,21 +972,19 @@ static void zram_slot_free_notify(struct block_device *bdev, static int zram_rw_page(struct block_device *bdev, sector_t sector, struct page *page, int rw) { - int offset, err; + int offset, err = -EIO; u32 index; struct zram *zram; struct bio_vec bv; zram = bdev->bd_disk->private_data; + if (unlikely(!zram_meta_get(zram))) + goto out; + if (!valid_io_request(zram, sector, PAGE_SIZE)) { atomic64_inc(&zram->stats.invalid_io); - return -EINVAL; - } - - down_read(&zram->init_lock); - if (unlikely(!init_done(zram))) { - err = -EIO; - goto out_unlock; + err = -EINVAL; + goto put_zram; } index = sector >> SECTORS_PER_PAGE_SHIFT; @@ -972,8 +995,9 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, bv.bv_offset = 0; err = zram_bvec_rw(zram, &bv, index, offset, rw); -out_unlock: - up_read(&zram->init_lock); +put_zram: + zram_meta_put(zram); +out: /* * If I/O fails, just return error(ie, non-zero) without * calling page_endio. @@ -1039,19 +1063,19 @@ static struct attribute_group zram_disk_attr_group = { static int create_device(struct zram *zram, int device_id) { + struct request_queue *queue; int ret = -ENOMEM; init_rwsem(&zram->init_lock); - zram->queue = blk_alloc_queue(GFP_KERNEL); - if (!zram->queue) { + queue = blk_alloc_queue(GFP_KERNEL); + if (!queue) { pr_err("Error allocating disk queue for device %d\n", device_id); goto out; } - blk_queue_make_request(zram->queue, zram_make_request); - zram->queue->queuedata = zram; + blk_queue_make_request(queue, zram_make_request); /* gendisk structure */ zram->disk = alloc_disk(1); @@ -1064,7 +1088,8 @@ static int create_device(struct zram *zram, int device_id) zram->disk->major = zram_major; zram->disk->first_minor = device_id; zram->disk->fops = &zram_devops; - zram->disk->queue = zram->queue; + zram->disk->queue = queue; + zram->disk->queue->queuedata = zram; zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); @@ -1115,20 +1140,35 @@ out_free_disk: del_gendisk(zram->disk); put_disk(zram->disk); out_free_queue: - blk_cleanup_queue(zram->queue); + blk_cleanup_queue(queue); out: return ret; } -static void destroy_device(struct zram *zram) +static void destroy_devices(unsigned int nr) { - sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, - &zram_disk_attr_group); + struct zram *zram; + unsigned int i; - del_gendisk(zram->disk); - put_disk(zram->disk); + for (i = 0; i < nr; i++) { + zram = &zram_devices[i]; + /* + * Remove sysfs first, so no one will perform a disksize + * store while we destroy the devices + */ + sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, + &zram_disk_attr_group); + + zram_reset_device(zram); - blk_cleanup_queue(zram->queue); + blk_cleanup_queue(zram->disk->queue); + del_gendisk(zram->disk); + put_disk(zram->disk); + } + + kfree(zram_devices); + unregister_blkdev(zram_major, "zram"); + pr_info("Destroyed %u device(s)\n", nr); } static int __init zram_init(void) @@ -1138,64 +1178,39 @@ static int __init zram_init(void) if (num_devices > max_num_devices) { pr_warn("Invalid value for num_devices: %u\n", num_devices); - ret = -EINVAL; - goto out; + return -EINVAL; } zram_major = register_blkdev(0, "zram"); if (zram_major <= 0) { pr_warn("Unable to get major number\n"); - ret = -EBUSY; - goto out; + return -EBUSY; } /* Allocate the device array and initialize each one */ zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL); if (!zram_devices) { - ret = -ENOMEM; - goto unregister; + unregister_blkdev(zram_major, "zram"); + return -ENOMEM; } for (dev_id = 0; dev_id < num_devices; dev_id++) { ret = create_device(&zram_devices[dev_id], dev_id); if (ret) - goto free_devices; + goto out_error; } - pr_info("Created %u device(s) ...\n", num_devices); - + pr_info("Created %u device(s)\n", num_devices); return 0; -free_devices: - while (dev_id) - destroy_device(&zram_devices[--dev_id]); - kfree(zram_devices); -unregister: - unregister_blkdev(zram_major, "zram"); -out: +out_error: + destroy_devices(dev_id); return ret; } static void __exit zram_exit(void) { - int i; - struct zram *zram; - - for (i = 0; i < num_devices; i++) { - zram = &zram_devices[i]; - - destroy_device(zram); - /* - * Shouldn't access zram->disk after destroy_device - * because destroy_device already released zram->disk. - */ - zram_reset_device(zram, false); - } - - unregister_blkdev(zram_major, "zram"); - - kfree(zram_devices); - pr_debug("Cleanup done!\n"); + destroy_devices(num_devices); } module_init(zram_init); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index b05a816b09ac..17056e589146 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -100,24 +100,25 @@ struct zram_meta { struct zram { struct zram_meta *meta; - struct request_queue *queue; - struct gendisk *disk; struct zcomp *comp; - - /* Prevent concurrent execution of device init, reset and R/W request */ + struct gendisk *disk; + /* Prevent concurrent execution of device init */ struct rw_semaphore init_lock; /* - * This is the limit on amount of *uncompressed* worth of data - * we can store in a disk. + * the number of pages zram can consume for storing compressed data */ - u64 disksize; /* bytes */ + unsigned long limit_pages; int max_comp_streams; + struct zram_stats stats; + atomic_t refcount; /* refcount for zram_meta */ + /* wait all IO under all of cpu are done */ + wait_queue_head_t io_done; /* - * the number of pages zram can consume for storing compressed data + * This is the limit on amount of *uncompressed* worth of data + * we can store in a disk. */ - unsigned long limit_pages; - + u64 disksize; /* bytes */ char compressor[10]; }; #endif diff --git a/fs/dcache.c b/fs/dcache.c index e368d4f412f9..d04be762b216 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -400,19 +400,20 @@ static void d_shrink_add(struct dentry *dentry, struct list_head *list) * LRU lists entirely, while shrink_move moves it to the indicated * private list. */ -static void d_lru_isolate(struct dentry *dentry) +static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); - list_del_init(&dentry->d_lru); + list_lru_isolate(lru, &dentry->d_lru); } -static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list) +static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, + struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; - list_move_tail(&dentry->d_lru, list); + list_lru_isolate_move(lru, &dentry->d_lru, list); } /* @@ -869,8 +870,8 @@ static void shrink_dentry_list(struct list_head *list) } } -static enum lru_status -dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) +static enum lru_status dentry_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); @@ -890,7 +891,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) * another pass through the LRU. */ if (dentry->d_lockref.count) { - d_lru_isolate(dentry); + d_lru_isolate(lru, dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } @@ -921,7 +922,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) return LRU_ROTATE; } - d_lru_shrink_move(dentry, freeable); + d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; @@ -930,30 +931,28 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) /** * prune_dcache_sb - shrink the dcache * @sb: superblock - * @nr_to_scan : number of entries to try to free - * @nid: which node to scan for freeable entities + * @sc: shrink control, passed to list_lru_shrink_walk() * - * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is - * done when we need more memory an called from the superblock shrinker + * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This + * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ -long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid) +long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(dispose); long freed; - freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate, - &dispose, &nr_to_scan); + freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, + dentry_lru_isolate, &dispose); shrink_dentry_list(&dispose); return freed; } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, - spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); @@ -966,7 +965,7 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; - d_lru_shrink_move(dentry, freeable); + d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 2bc2c87f35e7..5718cb9f7273 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -37,20 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) iput(toput_inode); } -static void drop_slab(void) -{ - int nr_objects; - - do { - int nid; - - nr_objects = 0; - for_each_online_node(nid) - nr_objects += shrink_node_slabs(GFP_KERNEL, nid, - 1000, 1000); - } while (nr_objects > 10); -} - int drop_caches_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3e193cb36996..3aa17d4d1cfc 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -145,7 +145,8 @@ static void gfs2_qd_dispose(struct list_head *list) } -static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg) +static enum lru_status gfs2_qd_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *dispose = arg; struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); @@ -155,7 +156,7 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, if (qd->qd_lockref.count == 0) { lockref_mark_dead(&qd->qd_lockref); - list_move(&qd->qd_lru, dispose); + list_lru_isolate_move(lru, &qd->qd_lru, dispose); } spin_unlock(&qd->qd_lockref.lock); @@ -171,8 +172,8 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; - freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate, - &dispose, &sc->nr_to_scan); + freed = list_lru_shrink_walk(&gfs2_qd_lru, sc, + gfs2_qd_isolate, &dispose); gfs2_qd_dispose(&dispose); @@ -182,7 +183,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid)); + return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); } struct shrinker gfs2_qd_shrinker = { diff --git a/fs/inode.c b/fs/inode.c index b7871577571d..86bfaca724db 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -672,8 +672,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ -static enum lru_status -inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) +static enum lru_status inode_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct inode *inode = container_of(item, struct inode, i_lru); @@ -691,7 +691,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) */ if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED)) { - list_del_init(&inode->i_lru); + list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; @@ -725,7 +725,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - list_move(&inode->i_lru, freeable); + list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); @@ -738,14 +738,13 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) * to trim from the LRU. Inodes to be freed are moved to a temporary list and * then are freed outside inode_lock by dispose_list(). */ -long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid) +long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(freeable); long freed; - freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, - &freeable, &nr_to_scan); + freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, + inode_lru_isolate, &freeable); dispose_list(&freeable); return freed; } diff --git a/fs/internal.h b/fs/internal.h index e9a61fe67575..d92c346a793d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -14,6 +14,7 @@ struct file_system_type; struct linux_binprm; struct path; struct mount; +struct shrink_control; /* * block_dev.c @@ -111,8 +112,7 @@ extern int open_check_o_direct(struct file *f); * inode.c */ extern spinlock_t inode_sb_list_lock; -extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid); +extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); /* @@ -129,8 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool); */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); extern int d_set_mounted(struct dentry *dentry); -extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid); +extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); /* * read_write.c diff --git a/fs/proc/array.c b/fs/proc/array.c index bd117d065b82..a3ccf4c4ce70 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -81,6 +81,7 @@ #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/tracehook.h> +#include <linux/string_helpers.h> #include <linux/user_namespace.h> #include <asm/pgtable.h> @@ -89,39 +90,18 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) { - int i; - char *buf, *end; - char *name; + char *buf; char tcomm[sizeof(p->comm)]; get_task_comm(tcomm, p); seq_puts(m, "Name:\t"); - end = m->buf + m->size; buf = m->buf + m->count; - name = tcomm; - i = sizeof(tcomm); - while (i && (buf < end)) { - unsigned char c = *name; - name++; - i--; - *buf = c; - if (!c) - break; - if (c == '\\') { - buf++; - if (buf < end) - *buf++ = c; - continue; - } - if (c == '\n') { - *buf++ = '\\'; - if (buf < end) - *buf++ = 'n'; - continue; - } - buf++; - } + + /* Ignore error for now */ + string_escape_str(tcomm, &buf, m->size - m->count, + ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + m->count = buf - m->buf; seq_putc(m, '\n'); } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 7fea13229f33..de14e46fd807 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -122,7 +122,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; - struct proc_dir_entry *de = PROC_I(inode)->pde; + struct proc_dir_entry *de = PDE(inode); if (de && de->nlink) set_nlink(inode, de->nlink); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8420a2f80811..13a50a32652d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -40,7 +40,7 @@ static void proc_evict_inode(struct inode *inode) put_pid(PROC_I(inode)->pid); /* Let go of any associated proc directory entry */ - de = PROC_I(inode)->pde; + de = PDE(inode); if (de) pde_put(de); head = PROC_I(inode)->sysctl; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 0e36c1e49fe3..956b75d61809 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -732,6 +732,7 @@ enum clear_refs_types { CLEAR_REFS_ANON, CLEAR_REFS_MAPPED, CLEAR_REFS_SOFT_DIRTY, + CLEAR_REFS_MM_HIWATER_RSS, CLEAR_REFS_LAST, }; @@ -907,6 +908,18 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .mm = mm, .private = &cp, }; + + if (type == CLEAR_REFS_MM_HIWATER_RSS) { + /* + * Writing 5 to /proc/pid/clear_refs resets the peak + * resident set size to this mm's current rss value. + */ + down_write(&mm->mmap_sem); + reset_mm_hiwater_rss(mm); + up_write(&mm->mmap_sem); + goto out_mm; + } + down_read(&mm->mmap_sem); if (type == CLEAR_REFS_SOFT_DIRTY) { for (vma = mm->mmap; vma; vma = vma->vm_next) { @@ -928,6 +941,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); up_read(&mm->mmap_sem); +out_mm: mmput(mm); } put_task_struct(task); @@ -1543,6 +1557,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) for_each_node_state(nid, N_MEMORY) if (md->node[nid]) seq_printf(m, " N%d=%lu", nid, md->node[nid]); + + seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); out: seq_putc(m, '\n'); m_cache_vma(m, vma); diff --git a/fs/select.c b/fs/select.c index 467bb1cb3ea5..f684c750e08a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -971,7 +971,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, if (ret == -EINTR) { struct restart_block *restart_block; - restart_block = ¤t_thread_info()->restart_block; + restart_block = ¤t->restart_block; restart_block->fn = do_restart_poll; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; diff --git a/fs/super.c b/fs/super.c index 05a021638b11..1facd2c282e5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -75,10 +75,10 @@ static unsigned long super_cache_scan(struct shrinker *shrink, return SHRINK_STOP; if (sb->s_op->nr_cached_objects) - fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid); + fs_objects = sb->s_op->nr_cached_objects(sb, sc); - inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid); - dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid); + inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); + dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = dentries + inodes + fs_objects + 1; if (!total_objects) total_objects = 1; @@ -86,19 +86,23 @@ static unsigned long super_cache_scan(struct shrinker *shrink, /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); + fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches + * + * Ensure that we always scan at least one object - memcg kmem + * accounting uses this to fully empty the caches. */ - freed = prune_dcache_sb(sb, dentries, sc->nid); - freed += prune_icache_sb(sb, inodes, sc->nid); + sc->nr_to_scan = dentries + 1; + freed = prune_dcache_sb(sb, sc); + sc->nr_to_scan = inodes + 1; + freed += prune_icache_sb(sb, sc); if (fs_objects) { - fs_objects = mult_frac(sc->nr_to_scan, fs_objects, - total_objects); - freed += sb->s_op->free_cached_objects(sb, fs_objects, - sc->nid); + sc->nr_to_scan = fs_objects + 1; + freed += sb->s_op->free_cached_objects(sb, sc); } drop_super(sb); @@ -118,17 +122,14 @@ static unsigned long super_cache_count(struct shrinker *shrink, * scalability bottleneck. The counts could get updated * between super_cache_count and super_cache_scan anyway. * Call to super_cache_count with shrinker_rwsem held - * ensures the safety of call to list_lru_count_node() and + * ensures the safety of call to list_lru_shrink_count() and * s_op->nr_cached_objects(). */ if (sb->s_op && sb->s_op->nr_cached_objects) - total_objects = sb->s_op->nr_cached_objects(sb, - sc->nid); + total_objects = sb->s_op->nr_cached_objects(sb, sc); - total_objects += list_lru_count_node(&sb->s_dentry_lru, - sc->nid); - total_objects += list_lru_count_node(&sb->s_inode_lru, - sc->nid); + total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); + total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); total_objects = vfs_pressure_ratio(total_objects); return total_objects; @@ -191,9 +192,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); - if (list_lru_init(&s->s_dentry_lru)) + if (list_lru_init_memcg(&s->s_dentry_lru)) goto fail; - if (list_lru_init(&s->s_inode_lru)) + if (list_lru_init_memcg(&s->s_inode_lru)) goto fail; init_rwsem(&s->s_umount); @@ -229,7 +230,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) s->s_shrink.scan_objects = super_cache_scan; s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; - s->s_shrink.flags = SHRINKER_NUMA_AWARE; + s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; return s; fail: @@ -284,6 +285,14 @@ void deactivate_locked_super(struct super_block *s) unregister_shrinker(&s->s_shrink); fs->kill_sb(s); + /* + * Since list_lru_destroy() may sleep, we cannot call it from + * put_super(), where we hold the sb_lock. Therefore we destroy + * the lru lists right now. + */ + list_lru_destroy(&s->s_dentry_lru); + list_lru_destroy(&s->s_inode_lru); + put_filesystem(fs); put_super(s); } else { diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index bb502a391792..1790b00bea7a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1488,6 +1488,7 @@ xfs_buf_iomove( static enum lru_status xfs_buftarg_wait_rele( struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) @@ -1509,7 +1510,7 @@ xfs_buftarg_wait_rele( */ atomic_set(&bp->b_lru_ref, 0); bp->b_state |= XFS_BSTATE_DISPOSE; - list_move(item, dispose); + list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lock); return LRU_REMOVED; } @@ -1546,6 +1547,7 @@ xfs_wait_buftarg( static enum lru_status xfs_buftarg_isolate( struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { @@ -1569,7 +1571,7 @@ xfs_buftarg_isolate( } bp->b_state |= XFS_BSTATE_DISPOSE; - list_move(item, dispose); + list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lock); return LRU_REMOVED; } @@ -1583,10 +1585,9 @@ xfs_buftarg_shrink_scan( struct xfs_buftarg, bt_shrinker); LIST_HEAD(dispose); unsigned long freed; - unsigned long nr_to_scan = sc->nr_to_scan; - freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, - &dispose, &nr_to_scan); + freed = list_lru_shrink_walk(&btp->bt_lru, sc, + xfs_buftarg_isolate, &dispose); while (!list_empty(&dispose)) { struct xfs_buf *bp; @@ -1605,7 +1606,7 @@ xfs_buftarg_shrink_count( { struct xfs_buftarg *btp = container_of(shrink, struct xfs_buftarg, bt_shrinker); - return list_lru_count_node(&btp->bt_lru, sc->nid); + return list_lru_shrink_count(&btp->bt_lru, sc); } void diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 3e8186279541..53cc2aaf8d2b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -430,6 +430,7 @@ struct xfs_qm_isolate { static enum lru_status xfs_qm_dquot_isolate( struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) __releases(lru_lock) __acquires(lru_lock) @@ -450,7 +451,7 @@ xfs_qm_dquot_isolate( XFS_STATS_INC(xs_qm_dqwants); trace_xfs_dqreclaim_want(dqp); - list_del_init(&dqp->q_lru); + list_lru_isolate(lru, &dqp->q_lru); XFS_STATS_DEC(xs_qm_dquot_unused); return LRU_REMOVED; } @@ -494,7 +495,7 @@ xfs_qm_dquot_isolate( xfs_dqunlock(dqp); ASSERT(dqp->q_nrefs == 0); - list_move_tail(&dqp->q_lru, &isol->dispose); + list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); XFS_STATS_DEC(xs_qm_dquot_unused); trace_xfs_dqreclaim_done(dqp); XFS_STATS_INC(xs_qm_dqreclaims); @@ -523,7 +524,6 @@ xfs_qm_shrink_scan( struct xfs_qm_isolate isol; unsigned long freed; int error; - unsigned long nr_to_scan = sc->nr_to_scan; if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) return 0; @@ -531,8 +531,8 @@ xfs_qm_shrink_scan( INIT_LIST_HEAD(&isol.buffers); INIT_LIST_HEAD(&isol.dispose); - freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, - &nr_to_scan); + freed = list_lru_shrink_walk(&qi->qi_lru, sc, + xfs_qm_dquot_isolate, &isol); error = xfs_buf_delwri_submit(&isol.buffers); if (error) @@ -557,7 +557,7 @@ xfs_qm_shrink_count( struct xfs_quotainfo *qi = container_of(shrink, struct xfs_quotainfo, qi_shrinker); - return list_lru_count_node(&qi->qi_lru, sc->nid); + return list_lru_shrink_count(&qi->qi_lru, sc); } /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f2449fd86926..8fcc4ccc5c79 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1537,7 +1537,7 @@ xfs_fs_mount( static long xfs_fs_nr_cached_objects( struct super_block *sb, - int nid) + struct shrink_control *sc) { return xfs_reclaim_inodes_count(XFS_M(sb)); } @@ -1545,10 +1545,9 @@ xfs_fs_nr_cached_objects( static long xfs_fs_free_cached_objects( struct super_block *sb, - long nr_to_scan, - int nid) + struct shrink_control *sc) { - return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); + return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); } static const struct super_operations xfs_super_operations = { diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h index 9318a87ee39a..a8f344363e77 100644 --- a/include/acpi/acoutput.h +++ b/include/acpi/acoutput.h @@ -240,7 +240,7 @@ /* * If ACPI_GET_FUNCTION_NAME was not defined in the compiler-dependent header, * define it now. This is the case where there the compiler does not support - * a __FUNCTION__ macro or equivalent. + * a __func__ macro or equivalent. */ #ifndef ACPI_GET_FUNCTION_NAME #define ACPI_GET_FUNCTION_NAME _acpi_function_name @@ -249,12 +249,12 @@ * The Name parameter should be the procedure name as a quoted string. * The function name is also used by the function exit macros below. * Note: (const char) is used to be compatible with the debug interfaces - * and macros such as __FUNCTION__. + * and macros such as __func__. */ #define ACPI_FUNCTION_NAME(name) static const char _acpi_function_name[] = #name; #else -/* Compiler supports __FUNCTION__ (or equivalent) -- Ignore this macro */ +/* Compiler supports __func__ (or equivalent) -- Ignore this macro */ #define ACPI_FUNCTION_NAME(name) #endif /* ACPI_GET_FUNCTION_NAME */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 129de9204d18..4d46085c1b90 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -244,10 +244,6 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) # define pte_accessible(mm, pte) ((void)(pte), 1) #endif -#ifndef pte_present_nonuma -#define pte_present_nonuma(pte) pte_present(pte) -#endif - #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif @@ -673,155 +669,24 @@ static inline int pmd_trans_unstable(pmd_t *pmd) #endif } -#ifdef CONFIG_NUMA_BALANCING -/* - * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that - * is protected for PROT_NONE and a NUMA hinting fault entry. If the - * architecture defines __PAGE_PROTNONE then it should take that into account - * but those that do not can rely on the fact that the NUMA hinting scanner - * skips inaccessible VMAs. - * - * pte/pmd_present() returns true if pte/pmd_numa returns true. Page - * fault triggers on those regions if pte/pmd_numa returns true - * (because _PAGE_PRESENT is not set). - */ -#ifndef pte_numa -static inline int pte_numa(pte_t pte) -{ - return ptenuma_flags(pte) == _PAGE_NUMA; -} -#endif - -#ifndef pmd_numa -static inline int pmd_numa(pmd_t pmd) -{ - return pmdnuma_flags(pmd) == _PAGE_NUMA; -} -#endif - +#ifndef CONFIG_NUMA_BALANCING /* - * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically - * because they're called by the NUMA hinting minor page fault. If we - * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler - * would be forced to set it later while filling the TLB after we - * return to userland. That would trigger a second write to memory - * that we optimize away by setting _PAGE_ACCESSED here. + * Technically a PTE can be PROTNONE even when not doing NUMA balancing but + * the only case the kernel cares is for NUMA balancing and is only ever set + * when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not marked + * _PAGE_PROTNONE so by by default, implement the helper as "always no". It + * is the responsibility of the caller to distinguish between PROT_NONE + * protections and NUMA hinting fault protections. */ -#ifndef pte_mknonnuma -static inline pte_t pte_mknonnuma(pte_t pte) -{ - pteval_t val = pte_val(pte); - - val &= ~_PAGE_NUMA; - val |= (_PAGE_PRESENT|_PAGE_ACCESSED); - return __pte(val); -} -#endif - -#ifndef pmd_mknonnuma -static inline pmd_t pmd_mknonnuma(pmd_t pmd) -{ - pmdval_t val = pmd_val(pmd); - - val &= ~_PAGE_NUMA; - val |= (_PAGE_PRESENT|_PAGE_ACCESSED); - - return __pmd(val); -} -#endif - -#ifndef pte_mknuma -static inline pte_t pte_mknuma(pte_t pte) -{ - pteval_t val = pte_val(pte); - - VM_BUG_ON(!(val & _PAGE_PRESENT)); - - val &= ~_PAGE_PRESENT; - val |= _PAGE_NUMA; - - return __pte(val); -} -#endif - -#ifndef ptep_set_numa -static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - pte_t ptent = *ptep; - - ptent = pte_mknuma(ptent); - set_pte_at(mm, addr, ptep, ptent); - return; -} -#endif - -#ifndef pmd_mknuma -static inline pmd_t pmd_mknuma(pmd_t pmd) -{ - pmdval_t val = pmd_val(pmd); - - val &= ~_PAGE_PRESENT; - val |= _PAGE_NUMA; - - return __pmd(val); -} -#endif - -#ifndef pmdp_set_numa -static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - pmd_t pmd = *pmdp; - - pmd = pmd_mknuma(pmd); - set_pmd_at(mm, addr, pmdp, pmd); - return; -} -#endif -#else -static inline int pmd_numa(pmd_t pmd) +static inline int pte_protnone(pte_t pte) { return 0; } -static inline int pte_numa(pte_t pte) +static inline int pmd_protnone(pmd_t pmd) { return 0; } - -static inline pte_t pte_mknonnuma(pte_t pte) -{ - return pte; -} - -static inline pmd_t pmd_mknonnuma(pmd_t pmd) -{ - return pmd; -} - -static inline pte_t pte_mknuma(pte_t pte) -{ - return pte; -} - -static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - return; -} - - -static inline pmd_t pmd_mknuma(pmd_t pmd) -{ - return pmd; -} - -static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - return ; -} #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_MMU */ diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 202e4034fe26..5f5c00de39f0 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -160,18 +160,18 @@ extern int bitmap_parselist(const char *buf, unsigned long *maskp, extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits); extern void bitmap_remap(unsigned long *dst, const unsigned long *src, - const unsigned long *old, const unsigned long *new, int bits); + const unsigned long *old, const unsigned long *new, unsigned int nbits); extern int bitmap_bitremap(int oldbit, const unsigned long *old, const unsigned long *new, int bits); extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, - const unsigned long *relmap, int bits); + const unsigned long *relmap, unsigned int bits); extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, - int sz, int bits); + unsigned int sz, unsigned int nbits); extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); -extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits); +extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); extern int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); @@ -185,33 +185,33 @@ extern int bitmap_print_to_pagebuf(bool list, char *buf, #define small_const_nbits(nbits) \ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) -static inline void bitmap_zero(unsigned long *dst, int nbits) +static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = 0UL; else { - int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); memset(dst, 0, len); } } -static inline void bitmap_fill(unsigned long *dst, int nbits) +static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) { - size_t nlongs = BITS_TO_LONGS(nbits); + unsigned int nlongs = BITS_TO_LONGS(nbits); if (!small_const_nbits(nbits)) { - int len = (nlongs - 1) * sizeof(unsigned long); + unsigned int len = (nlongs - 1) * sizeof(unsigned long); memset(dst, 0xff, len); } dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits); } static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, - int nbits) + unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src; else { - int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); memcpy(dst, src, len); } } diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index b950e9d6008b..ff9044286d88 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -905,13 +905,13 @@ static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp) } #define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS) -static inline void __cpus_setall(cpumask_t *dstp, int nbits) +static inline void __cpus_setall(cpumask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS) -static inline void __cpus_clear(cpumask_t *dstp, int nbits) +static inline void __cpus_clear(cpumask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } @@ -927,21 +927,21 @@ static inline int __cpu_test_and_set(int cpu, cpumask_t *addr) #define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS) static inline int __cpus_and(cpumask_t *dstp, const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS) static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } #define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS) static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -949,40 +949,40 @@ static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p, #define cpus_andnot(dst, src1, src2) \ __cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS) static inline int __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS) static inline int __cpus_equal(const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } #define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS) static inline int __cpus_intersects(const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } #define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS) static inline int __cpus_subset(const cpumask_t *src1p, - const cpumask_t *src2p, int nbits) + const cpumask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define cpus_empty(src) __cpus_empty(&(src), NR_CPUS) -static inline int __cpus_empty(const cpumask_t *srcp, int nbits) +static inline int __cpus_empty(const cpumask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS) -static inline int __cpus_weight(const cpumask_t *srcp, int nbits) +static inline int __cpus_weight(const cpumask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h index 2cd9f1cf9fa3..f4754282c9c2 100644 --- a/include/linux/cryptohash.h +++ b/include/linux/cryptohash.h @@ -1,6 +1,8 @@ #ifndef __CRYPTOHASH_H #define __CRYPTOHASH_H +#include <uapi/linux/types.h> + #define SHA_DIGEST_WORDS 5 #define SHA_MESSAGE_BYTES (512 /*bits*/ / 8) #define SHA_WORKSPACE_WORDS 16 diff --git a/include/linux/fs.h b/include/linux/fs.h index ec0f1dc66b9b..e49f10cc8a73 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1655,8 +1655,10 @@ struct super_operations { struct dquot **(*get_dquots)(struct inode *); #endif int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); - long (*nr_cached_objects)(struct super_block *, int); - long (*free_cached_objects)(struct super_block *, long, int); + long (*nr_cached_objects)(struct super_block *, + struct shrink_control *); + long (*free_cached_objects)(struct super_block *, + struct shrink_control *); }; /* diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 3037fc085e8e..d3d43ecf148c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -193,6 +193,9 @@ extern struct task_group root_task_group; .nr_cpus_allowed= NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ .se = { \ .group_node = LIST_HEAD_INIT(tsk.se.group_node), \ }, \ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e42e7dc34c68..d6d630d31ef3 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -800,9 +800,6 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } const typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) -/* Trap pasters of __FUNCTION__ at compile-time */ -#define __FUNCTION__ (__func__) - /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index f3434533fbf8..2a6b9947aaa3 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -9,6 +9,9 @@ #include <linux/list.h> #include <linux/nodemask.h> +#include <linux/shrinker.h> + +struct mem_cgroup; /* list_lru_walk_cb has to always return one of those */ enum lru_status { @@ -21,24 +24,45 @@ enum lru_status { internally, but has to return locked. */ }; -struct list_lru_node { - spinlock_t lock; +struct list_lru_one { struct list_head list; - /* kept as signed so we can catch imbalance bugs */ + /* may become negative during memcg reparenting */ long nr_items; +}; + +struct list_lru_memcg { + /* array of per cgroup lists, indexed by memcg_cache_id */ + struct list_lru_one *lru[0]; +}; + +struct list_lru_node { + /* protects all lists on the node, including per cgroup */ + spinlock_t lock; + /* global list, used for the root cgroup in cgroup aware lrus */ + struct list_lru_one lru; +#ifdef CONFIG_MEMCG_KMEM + /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ + struct list_lru_memcg *memcg_lrus; +#endif } ____cacheline_aligned_in_smp; struct list_lru { struct list_lru_node *node; - nodemask_t active_nodes; +#ifdef CONFIG_MEMCG_KMEM + struct list_head list; +#endif }; void list_lru_destroy(struct list_lru *lru); -int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key); -static inline int list_lru_init(struct list_lru *lru) -{ - return list_lru_init_key(lru, NULL); -} +int __list_lru_init(struct list_lru *lru, bool memcg_aware, + struct lock_class_key *key); + +#define list_lru_init(lru) __list_lru_init((lru), false, NULL) +#define list_lru_init_key(lru, key) __list_lru_init((lru), false, (key)) +#define list_lru_init_memcg(lru) __list_lru_init((lru), true, NULL) + +int memcg_update_all_list_lrus(int num_memcgs); +void memcg_drain_all_list_lrus(int src_idx, int dst_idx); /** * list_lru_add: add an element to the lru list's tail @@ -72,32 +96,48 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item); bool list_lru_del(struct list_lru *lru, struct list_head *item); /** - * list_lru_count_node: return the number of objects currently held by @lru + * list_lru_count_one: return the number of objects currently held by @lru * @lru: the lru pointer. * @nid: the node id to count from. + * @memcg: the cgroup to count from. * * Always return a non-negative number, 0 for empty lists. There is no * guarantee that the list is not updated while the count is being computed. * Callers that want such a guarantee need to provide an outer lock. */ +unsigned long list_lru_count_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg); unsigned long list_lru_count_node(struct list_lru *lru, int nid); + +static inline unsigned long list_lru_shrink_count(struct list_lru *lru, + struct shrink_control *sc) +{ + return list_lru_count_one(lru, sc->nid, sc->memcg); +} + static inline unsigned long list_lru_count(struct list_lru *lru) { long count = 0; int nid; - for_each_node_mask(nid, lru->active_nodes) + for_each_node_state(nid, N_NORMAL_MEMORY) count += list_lru_count_node(lru, nid); return count; } -typedef enum lru_status -(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg); +void list_lru_isolate(struct list_lru_one *list, struct list_head *item); +void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, + struct list_head *head); + +typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, + struct list_lru_one *list, spinlock_t *lock, void *cb_arg); + /** - * list_lru_walk_node: walk a list_lru, isolating and disposing freeable items. + * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. + * @memcg: the cgroup to scan from. * @isolate: callback function that is resposible for deciding what to do with * the item currently being scanned * @cb_arg: opaque type that will be passed to @isolate @@ -115,18 +155,30 @@ typedef enum lru_status * * Return value: the number of objects effectively removed from the LRU. */ +unsigned long list_lru_walk_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk); unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); static inline unsigned long +list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc, + list_lru_walk_cb isolate, void *cb_arg) +{ + return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg, + &sc->nr_to_scan); +} + +static inline unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate, void *cb_arg, unsigned long nr_to_walk) { long isolated = 0; int nid; - for_each_node_mask(nid, lru->active_nodes) { + for_each_node_state(nid, N_NORMAL_MEMORY) { isolated += list_lru_walk_node(lru, nid, isolate, cb_arg, &nr_to_walk); if (nr_to_walk <= 0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6cfd934c7c9b..72dff5fb0d0c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -398,7 +398,9 @@ static inline void sock_release_memcg(struct sock *sk) #ifdef CONFIG_MEMCG_KMEM extern struct static_key memcg_kmem_enabled_key; -extern int memcg_limited_groups_array_size; +extern int memcg_nr_cache_ids; +extern void memcg_get_cache_ids(void); +extern void memcg_put_cache_ids(void); /* * Helper macro to loop through all memcg-specific caches. Callers must still @@ -406,13 +408,15 @@ extern int memcg_limited_groups_array_size; * the slab_mutex must be held when looping through those caches */ #define for_each_memcg_cache_index(_idx) \ - for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++) + for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++) static inline bool memcg_kmem_enabled(void) { return static_key_false(&memcg_kmem_enabled_key); } +bool memcg_kmem_is_active(struct mem_cgroup *memcg); + /* * In general, we'll do everything in our power to not incur in any overhead * for non-memcg users for the kmem functions. Not even a function call, if we @@ -432,11 +436,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order); int memcg_cache_id(struct mem_cgroup *memcg); -void memcg_update_array_size(int num_groups); - struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep); +struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr); + int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages); void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); @@ -533,6 +537,13 @@ static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) if (memcg_kmem_enabled()) __memcg_kmem_put_cache(cachep); } + +static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) +{ + if (!memcg_kmem_enabled()) + return NULL; + return __mem_cgroup_from_kmem(ptr); +} #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -542,6 +553,11 @@ static inline bool memcg_kmem_enabled(void) return false; } +static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) +{ + return false; +} + static inline bool memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) { @@ -562,6 +578,14 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return -1; } +static inline void memcg_get_cache_ids(void) +{ +} + +static inline void memcg_put_cache_ids(void) +{ +} + static inline struct kmem_cache * memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { @@ -571,6 +595,11 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) { } + +static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) +{ + return NULL; +} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index fab9b32ace8e..78baed5f2952 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -67,7 +67,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #ifdef CONFIG_NUMA_BALANCING extern bool pmd_trans_migrating(pmd_t pmd); -extern void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd); extern int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node); extern bool migrate_ratelimited(int node); @@ -76,9 +75,6 @@ static inline bool pmd_trans_migrating(pmd_t pmd) { return false; } -static inline void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) -{ -} static inline int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node) { diff --git a/include/linux/mm.h b/include/linux/mm.h index a4d24f3c5430..9bee7ec0c31f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1408,6 +1408,11 @@ static inline void update_hiwater_vm(struct mm_struct *mm) mm->hiwater_vm = mm->total_vm; } +static inline void reset_mm_hiwater_rss(struct mm_struct *mm) +{ + mm->hiwater_rss = get_mm_rss(mm); +} + static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, struct mm_struct *mm) { @@ -1447,13 +1452,15 @@ static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif -#ifdef __PAGETABLE_PMD_FOLDED +#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return 0; } +static inline void mm_nr_pmds_init(struct mm_struct *mm) {} + static inline unsigned long mm_nr_pmds(struct mm_struct *mm) { return 0; @@ -1465,6 +1472,11 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); +static inline void mm_nr_pmds_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_pmds, 0); +} + static inline unsigned long mm_nr_pmds(struct mm_struct *mm) { return atomic_long_read(&mm->nr_pmds); @@ -2168,9 +2180,8 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); #endif -unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, - unsigned long nr_scanned, - unsigned long nr_eligible); +void drop_slab(void); +void drop_slab_node(int nid); #ifndef CONFIG_MMU #define randomize_va_space 0 diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 83a6aeda899d..21cef483dc1b 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -120,13 +120,13 @@ static inline void __node_clear(int node, volatile nodemask_t *dstp) } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) -static inline void __nodes_setall(nodemask_t *dstp, int nbits) +static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) -static inline void __nodes_clear(nodemask_t *dstp, int nbits) +static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } @@ -144,7 +144,7 @@ static inline int __node_test_and_set(int node, nodemask_t *addr) #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -152,7 +152,7 @@ static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -160,7 +160,7 @@ static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -168,7 +168,7 @@ static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } @@ -176,7 +176,7 @@ static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static inline void __nodes_complement(nodemask_t *dstp, - const nodemask_t *srcp, int nbits) + const nodemask_t *srcp, unsigned int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); } @@ -184,7 +184,7 @@ static inline void __nodes_complement(nodemask_t *dstp, #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_equal(const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } @@ -192,7 +192,7 @@ static inline int __nodes_equal(const nodemask_t *src1p, #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_intersects(const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } @@ -200,25 +200,25 @@ static inline int __nodes_intersects(const nodemask_t *src1p, #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_subset(const nodemask_t *src1p, - const nodemask_t *src2p, int nbits) + const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) -static inline int __nodes_empty(const nodemask_t *srcp, int nbits) +static inline int __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) -static inline int __nodes_full(const nodemask_t *srcp, int nbits) +static inline int __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) -static inline int __nodes_weight(const nodemask_t *srcp, int nbits) +static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } diff --git a/include/linux/printk.h b/include/linux/printk.h index 4d5bf5726578..baa3f97d8ce8 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -417,9 +417,9 @@ enum { DUMP_PREFIX_ADDRESS, DUMP_PREFIX_OFFSET }; -extern void hex_dump_to_buffer(const void *buf, size_t len, - int rowsize, int groupsize, - char *linebuf, size_t linebuflen, bool ascii); +extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, + int groupsize, char *linebuf, size_t linebuflen, + bool ascii); #ifdef CONFIG_PRINTK extern void print_hex_dump(const char *level, const char *prefix_str, int prefix_type, int rowsize, int groupsize, diff --git a/include/linux/sched.h b/include/linux/sched.h index 8db31ef98d2f..048b91b983ed 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1370,6 +1370,8 @@ struct task_struct { unsigned long atomic_flags; /* Flags needing atomic access. */ + struct restart_block restart_block; + pid_t pid; pid_t tgid; @@ -2145,6 +2147,7 @@ extern unsigned long long notrace sched_clock(void); */ extern u64 cpu_clock(int cpu); extern u64 local_clock(void); +extern u64 running_clock(void); extern u64 sched_clock_cpu(int cpu); diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index f4aee75f00b1..4fcacd915d45 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -20,6 +20,9 @@ struct shrink_control { /* current node being shrunk (for NUMA aware shrinkers) */ int nid; + + /* current memcg being shrunk (for memcg aware shrinkers) */ + struct mem_cgroup *memcg; }; #define SHRINK_STOP (~0UL) @@ -61,7 +64,8 @@ struct shrinker { #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ /* Flags */ -#define SHRINKER_NUMA_AWARE (1 << 0) +#define SHRINKER_NUMA_AWARE (1 << 0) +#define SHRINKER_MEMCG_AWARE (1 << 1) extern int register_shrinker(struct shrinker *); extern void unregister_shrinker(struct shrinker *); diff --git a/include/linux/slab.h b/include/linux/slab.h index 2e3b448cfa2d..ed2ffaab59ea 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -115,13 +115,12 @@ int slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); -#ifdef CONFIG_MEMCG_KMEM -void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); -void memcg_destroy_kmem_caches(struct mem_cgroup *); -#endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); -void kmem_cache_free(struct kmem_cache *, void *); + +void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); +void memcg_deactivate_kmem_caches(struct mem_cgroup *); +void memcg_destroy_kmem_caches(struct mem_cgroup *); /* * Please use this macro to create slab caches. Simply specify the @@ -288,6 +287,7 @@ static __always_inline int kmalloc_index(size_t size) void *__kmalloc(size_t size, gfp_t flags); void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); +void kmem_cache_free(struct kmem_cache *, void *); #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node); @@ -473,14 +473,14 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) #ifndef ARCH_SLAB_MINALIGN #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif + +struct memcg_cache_array { + struct rcu_head rcu; + struct kmem_cache *entries[0]; +}; + /* * This is the main placeholder for memcg-related information in kmem caches. - * struct kmem_cache will hold a pointer to it, so the memory cost while - * disabled is 1 pointer. The runtime cost while enabled, gets bigger than it - * would otherwise be if that would be bundled in kmem_cache: we'll need an - * extra pointer chase. But the trade off clearly lays in favor of not - * penalizing non-users. - * * Both the root cache and the child caches will have it. For the root cache, * this will hold a dynamically allocated array large enough to hold * information about the currently limited memcgs in the system. To allow the @@ -491,14 +491,15 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * * @memcg: pointer to the memcg this cache belongs to * @root_cache: pointer to the global, root cache, this cache was derived from + * + * Both root and child caches of the same kind are linked into a list chained + * through @list. */ struct memcg_cache_params { bool is_root_cache; + struct list_head list; union { - struct { - struct rcu_head rcu_head; - struct kmem_cache *memcg_caches[0]; - }; + struct memcg_cache_array __rcu *memcg_caches; struct { struct mem_cgroup *memcg; struct kmem_cache *root_cache; diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index b869d1662ba3..33d049066c3d 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -70,7 +70,7 @@ struct kmem_cache { int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ #ifdef CONFIG_MEMCG_KMEM - struct memcg_cache_params *memcg_params; + struct memcg_cache_params memcg_params; #endif struct kmem_cache_node *node[MAX_NUMNODES]; diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index d82abd40a3c0..9abf04ed0999 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -85,7 +85,7 @@ struct kmem_cache { struct kobject kobj; /* For sysfs */ #endif #ifdef CONFIG_MEMCG_KMEM - struct memcg_cache_params *memcg_params; + struct memcg_cache_params memcg_params; int max_attr_size; /* for propagation, maximum size of a stored attr */ #ifdef CONFIG_SYSFS struct kset *memcg_kset; diff --git a/include/linux/string.h b/include/linux/string.h index 2e22a2e58f3a..b9bc9a5d9e21 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -40,9 +40,6 @@ extern int strcmp(const char *,const char *); #ifndef __HAVE_ARCH_STRNCMP extern int strncmp(const char *,const char *,__kernel_size_t); #endif -#ifndef __HAVE_ARCH_STRNICMP -#define strnicmp strncasecmp -#endif #ifndef __HAVE_ARCH_STRCASECMP extern int strcasecmp(const char *s1, const char *s2); #endif diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 6eb567ac56bc..657571817260 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -10,8 +10,8 @@ enum string_size_units { STRING_UNITS_2, /* use binary powers of 2^10 */ }; -int string_get_size(u64 size, enum string_size_units units, - char *buf, int len); +void string_get_size(u64 size, enum string_size_units units, + char *buf, int len); #define UNESCAPE_SPACE 0x01 #define UNESCAPE_OCTAL 0x02 diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 831a3168ab35..cedf3d3c373f 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte) && !pte_present_nonuma(pte); + return !pte_none(pte) && !pte_present(pte); } #endif diff --git a/include/linux/types.h b/include/linux/types.h index 62323825cff9..6747247e3f9f 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -135,12 +135,9 @@ typedef unsigned long blkcnt_t; #endif /* - * The type of an index into the pagecache. Use a #define so asm/types.h - * can override it. + * The type of an index into the pagecache. */ -#ifndef pgoff_t #define pgoff_t unsigned long -#endif /* A dma_addr_t can hold any valid DMA or bus address for the platform */ #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT diff --git a/include/linux/zpool.h b/include/linux/zpool.h index f14bd75f08b3..56529b34dc63 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -36,7 +36,8 @@ enum zpool_mapmode { ZPOOL_MM_DEFAULT = ZPOOL_MM_RW }; -struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops); +struct zpool *zpool_create_pool(char *type, char *name, + gfp_t gfp, struct zpool_ops *ops); char *zpool_get_type(struct zpool *pool); @@ -80,7 +81,7 @@ struct zpool_driver { atomic_t refcount; struct list_head list; - void *(*create)(gfp_t gfp, struct zpool_ops *ops); + void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops); void (*destroy)(void *pool); int (*malloc)(void *pool, size_t size, gfp_t gfp, diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 05c214760977..3283c6a55425 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -36,7 +36,7 @@ enum zs_mapmode { struct zs_pool; -struct zs_pool *zs_create_pool(gfp_t flags); +struct zs_pool *zs_create_pool(char *name, gfp_t flags); void zs_destroy_pool(struct zs_pool *pool); unsigned long zs_malloc(struct zs_pool *pool, size_t size); diff --git a/include/net/sock.h b/include/net/sock.h index e13824570b0f..ab186b1d31ff 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1077,11 +1077,6 @@ static inline bool memcg_proto_active(struct cg_proto *cg_proto) return test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); } -static inline bool memcg_proto_activated(struct cg_proto *cg_proto) -{ - return test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags); -} - #ifdef SOCK_REFCNT_DEBUG static inline void sk_refcnt_debug_inc(struct sock *sk) { diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 0d11c3dcd3a1..9cd8b21dddbe 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -67,7 +67,7 @@ enum mpol_rebind_step { #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ -#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */ +#define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ #endif /* _UAPI_LINUX_MEMPOLICY_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 04cfe8ace520..d5f6ec251fb2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4373,16 +4373,20 @@ static void css_free_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; percpu_ref_exit(&css->refcnt); - if (css->ss) { + if (ss) { /* css free path */ + int id = css->id; + if (css->parent) css_put(css->parent); - css->ss->css_free(css); + ss->css_free(css); + cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); } else { /* cgroup free path */ @@ -4434,7 +4438,7 @@ static void css_release_work_fn(struct work_struct *work) if (ss) { /* css release path */ - cgroup_idr_remove(&ss->css_idr, css->id); + cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); } else { diff --git a/kernel/compat.c b/kernel/compat.c index ebb3c369d03d..24f00610c575 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -276,8 +276,7 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, * core implementation decides to return random nonsense. */ if (ret == -ERESTART_RESTARTBLOCK) { - struct restart_block *restart - = ¤t_thread_info()->restart_block; + struct restart_block *restart = ¤t->restart_block; restart->fn = compat_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; @@ -860,7 +859,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, return -EFAULT; if (err == -ERESTART_RESTARTBLOCK) { - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = compat_clock_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64b257f6bca2..c54a1dae6c11 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2400,7 +2400,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk) */ } -void cpuset_init_current_mems_allowed(void) +void __init cpuset_init_current_mems_allowed(void) { nodes_setall(current->mems_allowed); } diff --git a/kernel/fork.c b/kernel/fork.c index 66e19c251581..cf65139615a0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -555,9 +555,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); -#ifndef __PAGETABLE_PMD_FOLDED - atomic_long_set(&mm->nr_pmds, 0); -#endif + mm_nr_pmds_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; diff --git a/kernel/futex.c b/kernel/futex.c index 4eeb63de7e54..2a5e3830e953 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2217,7 +2217,7 @@ retry: if (!abs_time) goto out; - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = futex_wait_restart; restart->futex.uaddr = uaddr; restart->futex.val = val; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 02d6b6d28796..c06df7de0963 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -935,8 +935,8 @@ static int __init ignore_loglevel_setup(char *str) early_param("ignore_loglevel", ignore_loglevel_setup); module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" - "print all kernel messages to the console."); +MODULE_PARM_DESC(ignore_loglevel, + "ignore loglevel setting (prints all kernel messages to the console)"); #ifdef CONFIG_BOOT_PRINTK_DELAY @@ -1419,16 +1419,16 @@ static void call_console_drivers(int level, const char *text, size_t len) } /* - * Zap console related locks when oopsing. Only zap at most once - * every 10 seconds, to leave time for slow consoles to print a - * full oops. + * Zap console related locks when oopsing. + * To leave time for slow consoles to print a full oops, + * only zap at most once every 30 seconds. */ static void zap_locks(void) { static unsigned long oops_timestamp; if (time_after_eq(jiffies, oops_timestamp) && - !time_after(jiffies, oops_timestamp + 30 * HZ)) + !time_after(jiffies, oops_timestamp + 30 * HZ)) return; oops_timestamp = jiffies; diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c27e4f8f4879..c0a205101c23 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -420,3 +420,16 @@ u64 local_clock(void) EXPORT_SYMBOL_GPL(cpu_clock); EXPORT_SYMBOL_GPL(local_clock); + +/* + * Running clock - returns the time that has elapsed while a guest has been + * running. + * On a guest this value should be local_clock minus the time the guest was + * suspended by the hypervisor (for any reason). + * On bare metal this function should return the same as local_clock. + * Architectures and sub-architectures can override this. + */ +u64 __weak running_clock(void) +{ + return local_clock(); +} diff --git a/kernel/signal.c b/kernel/signal.c index 16a305295256..33a52759cc0e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2501,7 +2501,7 @@ EXPORT_SYMBOL(unblock_all_signals); */ SYSCALL_DEFINE0(restart_syscall) { - struct restart_block *restart = ¤t_thread_info()->restart_block; + struct restart_block *restart = ¤t->restart_block; return restart->fn(restart); } diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index a7077d3ae52f..1b001ed1edb9 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -788,7 +788,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, goto out; } - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = alarm_timer_nsleep_restart; restart->nanosleep.clockid = type; restart->nanosleep.expires = exp.tv64; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3f5e183c3d97..bee0c1f78091 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1583,7 +1583,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, goto out; } - restart = ¤t_thread_info()->restart_block; + restart = ¤t->restart_block; restart->fn = hrtimer_nanosleep_restart; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.rmtp = rmtp; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index a16b67859e2a..0075da74abf0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1334,8 +1334,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, struct timespec *rqtp, struct timespec __user *rmtp) { - struct restart_block *restart_block = - ¤t_thread_info()->restart_block; + struct restart_block *restart_block = ¤t->restart_block; struct itimerspec it; int error; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 70bf11815f84..3174bf8e3538 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -154,7 +154,7 @@ static int get_softlockup_thresh(void) */ static unsigned long get_timestamp(void) { - return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ + return running_clock() >> 30LL; /* 2^30 ~= 10^9 */ } static void set_sample_period(void) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e5ea3ab856bf..79a9bb67aeaf 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1580,6 +1580,9 @@ config ASYNC_RAID6_TEST If unsure, say N. +config TEST_HEXDUMP + tristate "Test functions located in the hexdump module at runtime" + config TEST_STRING_HELPERS tristate "Test functions located in the string_helpers module at runtime" diff --git a/lib/Makefile b/lib/Makefile index 25c061f77df7..e456defd1021 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -23,12 +23,14 @@ lib-y += kobject.o klist.o obj-y += lockref.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ - bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ + bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o clz_ctz.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o +obj-y += hexdump.o +obj-$(CONFIG_TEST_HEXDUMP) += test-hexdump.o obj-y += kstrtox.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LKM) += test_module.o diff --git a/lib/bitmap.c b/lib/bitmap.c index 324ea9eab8c1..ad161a6c82db 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -744,10 +744,10 @@ EXPORT_SYMBOL(bitmap_parselist_user); /** * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap * @buf: pointer to a bitmap - * @pos: a bit position in @buf (0 <= @pos < @bits) - * @bits: number of valid bit positions in @buf + * @pos: a bit position in @buf (0 <= @pos < @nbits) + * @nbits: number of valid bit positions in @buf * - * Map the bit at position @pos in @buf (of length @bits) to the + * Map the bit at position @pos in @buf (of length @nbits) to the * ordinal of which set bit it is. If it is not set or if @pos * is not a valid bit position, map to -1. * @@ -759,56 +759,40 @@ EXPORT_SYMBOL(bitmap_parselist_user); * * The bit positions 0 through @bits are valid positions in @buf. */ -static int bitmap_pos_to_ord(const unsigned long *buf, int pos, int bits) +static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits) { - int i, ord; - - if (pos < 0 || pos >= bits || !test_bit(pos, buf)) + if (pos >= nbits || !test_bit(pos, buf)) return -1; - i = find_first_bit(buf, bits); - ord = 0; - while (i < pos) { - i = find_next_bit(buf, bits, i + 1); - ord++; - } - BUG_ON(i != pos); - - return ord; + return __bitmap_weight(buf, pos); } /** * bitmap_ord_to_pos - find position of n-th set bit in bitmap * @buf: pointer to bitmap * @ord: ordinal bit position (n-th set bit, n >= 0) - * @bits: number of valid bit positions in @buf + * @nbits: number of valid bit positions in @buf * * Map the ordinal offset of bit @ord in @buf to its position in @buf. - * Value of @ord should be in range 0 <= @ord < weight(buf), else - * results are undefined. + * Value of @ord should be in range 0 <= @ord < weight(buf). If @ord + * >= weight(buf), returns @nbits. * * If for example, just bits 4 through 7 are set in @buf, then @ord * values 0 through 3 will get mapped to 4 through 7, respectively, - * and all other @ord values return undefined values. When @ord value 3 + * and all other @ord values returns @nbits. When @ord value 3 * gets mapped to (returns) @pos value 7 in this example, that means * that the 3rd set bit (starting with 0th) is at position 7 in @buf. * - * The bit positions 0 through @bits are valid positions in @buf. + * The bit positions 0 through @nbits-1 are valid positions in @buf. */ -int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits) +unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsigned int nbits) { - int pos = 0; - - if (ord >= 0 && ord < bits) { - int i; + unsigned int pos; - for (i = find_first_bit(buf, bits); - i < bits && ord > 0; - i = find_next_bit(buf, bits, i + 1)) - ord--; - if (i < bits && ord == 0) - pos = i; - } + for (pos = find_first_bit(buf, nbits); + pos < nbits && ord; + pos = find_next_bit(buf, nbits, pos + 1)) + ord--; return pos; } @@ -819,7 +803,7 @@ int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits) * @src: subset to be remapped * @old: defines domain of map * @new: defines range of map - * @bits: number of bits in each of these bitmaps + * @nbits: number of bits in each of these bitmaps * * Let @old and @new define a mapping of bit positions, such that * whatever position is held by the n-th set bit in @old is mapped @@ -847,22 +831,22 @@ int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits) */ void bitmap_remap(unsigned long *dst, const unsigned long *src, const unsigned long *old, const unsigned long *new, - int bits) + unsigned int nbits) { - int oldbit, w; + unsigned int oldbit, w; if (dst == src) /* following doesn't handle inplace remaps */ return; - bitmap_zero(dst, bits); + bitmap_zero(dst, nbits); - w = bitmap_weight(new, bits); - for_each_set_bit(oldbit, src, bits) { - int n = bitmap_pos_to_ord(old, oldbit, bits); + w = bitmap_weight(new, nbits); + for_each_set_bit(oldbit, src, nbits) { + int n = bitmap_pos_to_ord(old, oldbit, nbits); if (n < 0 || w == 0) set_bit(oldbit, dst); /* identity map */ else - set_bit(bitmap_ord_to_pos(new, n % w, bits), dst); + set_bit(bitmap_ord_to_pos(new, n % w, nbits), dst); } } EXPORT_SYMBOL(bitmap_remap); @@ -1006,9 +990,9 @@ EXPORT_SYMBOL(bitmap_bitremap); * All bits in @dst not set by the above rule are cleared. */ void bitmap_onto(unsigned long *dst, const unsigned long *orig, - const unsigned long *relmap, int bits) + const unsigned long *relmap, unsigned int bits) { - int n, m; /* same meaning as in above comment */ + unsigned int n, m; /* same meaning as in above comment */ if (dst == orig) /* following doesn't handle inplace mappings */ return; @@ -1039,22 +1023,22 @@ EXPORT_SYMBOL(bitmap_onto); * @dst: resulting smaller bitmap * @orig: original larger bitmap * @sz: specified size - * @bits: number of bits in each of these bitmaps + * @nbits: number of bits in each of these bitmaps * * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst. * Clear all other bits in @dst. See further the comment and * Example [2] for bitmap_onto() for why and how to use this. */ void bitmap_fold(unsigned long *dst, const unsigned long *orig, - int sz, int bits) + unsigned int sz, unsigned int nbits) { - int oldbit; + unsigned int oldbit; if (dst == orig) /* following doesn't handle inplace mappings */ return; - bitmap_zero(dst, bits); + bitmap_zero(dst, nbits); - for_each_set_bit(oldbit, orig, bits) + for_each_set_bit(oldbit, orig, nbits) set_bit(oldbit % sz, dst); } EXPORT_SYMBOL(bitmap_fold); diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c index 0777c5a45fa0..f346715e2255 100644 --- a/lib/dynamic_queue_limits.c +++ b/lib/dynamic_queue_limits.c @@ -3,12 +3,12 @@ * * Copyright (c) 2011, Tom Herbert <therbert@google.com> */ -#include <linux/module.h> #include <linux/types.h> -#include <linux/ctype.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/dynamic_queue_limits.h> +#include <linux/compiler.h> +#include <linux/export.h> #define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0) #define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0) diff --git a/lib/genalloc.c b/lib/genalloc.c index 2e65d206b01c..0fe1cbe87700 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -34,7 +34,6 @@ #include <linux/rculist.h> #include <linux/interrupt.h> #include <linux/genalloc.h> -#include <linux/of_address.h> #include <linux/of_device.h> static inline size_t chunk_size(const struct gen_pool_chunk *chunk) @@ -415,7 +414,7 @@ bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, size_t size) { bool found = false; - unsigned long end = start + size; + unsigned long end = start + size - 1; struct gen_pool_chunk *chunk; rcu_read_lock(); diff --git a/lib/halfmd4.c b/lib/halfmd4.c index 66d0ee8b7776..a8fe6274a13c 100644 --- a/lib/halfmd4.c +++ b/lib/halfmd4.c @@ -1,4 +1,4 @@ -#include <linux/kernel.h> +#include <linux/compiler.h> #include <linux/export.h> #include <linux/cryptohash.h> diff --git a/lib/hexdump.c b/lib/hexdump.c index 270773b91923..7ea09699855d 100644 --- a/lib/hexdump.c +++ b/lib/hexdump.c @@ -97,63 +97,79 @@ EXPORT_SYMBOL(bin2hex); * * example output buffer: * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO + * + * Return: + * The amount of bytes placed in the buffer without terminating NUL. If the + * output was truncated, then the return value is the number of bytes + * (excluding the terminating NUL) which would have been written to the final + * string if enough space had been available. */ -void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, - int groupsize, char *linebuf, size_t linebuflen, - bool ascii) +int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, + char *linebuf, size_t linebuflen, bool ascii) { const u8 *ptr = buf; + int ngroups; u8 ch; int j, lx = 0; int ascii_column; + int ret; if (rowsize != 16 && rowsize != 32) rowsize = 16; - if (!len) - goto nil; if (len > rowsize) /* limit to one line at a time */ len = rowsize; + if (!is_power_of_2(groupsize) || groupsize > 8) + groupsize = 1; if ((len % groupsize) != 0) /* no mixed size output */ groupsize = 1; - switch (groupsize) { - case 8: { - const u64 *ptr8 = buf; - int ngroups = len / groupsize; + ngroups = len / groupsize; + ascii_column = rowsize * 2 + rowsize / groupsize + 1; - for (j = 0; j < ngroups; j++) - lx += scnprintf(linebuf + lx, linebuflen - lx, - "%s%16.16llx", j ? " " : "", - (unsigned long long)*(ptr8 + j)); - ascii_column = 17 * ngroups + 2; - break; - } + if (!linebuflen) + goto overflow1; - case 4: { - const u32 *ptr4 = buf; - int ngroups = len / groupsize; + if (!len) + goto nil; - for (j = 0; j < ngroups; j++) - lx += scnprintf(linebuf + lx, linebuflen - lx, - "%s%8.8x", j ? " " : "", *(ptr4 + j)); - ascii_column = 9 * ngroups + 2; - break; - } + if (groupsize == 8) { + const u64 *ptr8 = buf; - case 2: { - const u16 *ptr2 = buf; - int ngroups = len / groupsize; + for (j = 0; j < ngroups; j++) { + ret = snprintf(linebuf + lx, linebuflen - lx, + "%s%16.16llx", j ? " " : "", + (unsigned long long)*(ptr8 + j)); + if (ret >= linebuflen - lx) + goto overflow1; + lx += ret; + } + } else if (groupsize == 4) { + const u32 *ptr4 = buf; - for (j = 0; j < ngroups; j++) - lx += scnprintf(linebuf + lx, linebuflen - lx, - "%s%4.4x", j ? " " : "", *(ptr2 + j)); - ascii_column = 5 * ngroups + 2; - break; - } + for (j = 0; j < ngroups; j++) { + ret = snprintf(linebuf + lx, linebuflen - lx, + "%s%8.8x", j ? " " : "", + *(ptr4 + j)); + if (ret >= linebuflen - lx) + goto overflow1; + lx += ret; + } + } else if (groupsize == 2) { + const u16 *ptr2 = buf; - default: - for (j = 0; (j < len) && (lx + 3) <= linebuflen; j++) { + for (j = 0; j < ngroups; j++) { + ret = snprintf(linebuf + lx, linebuflen - lx, + "%s%4.4x", j ? " " : "", + *(ptr2 + j)); + if (ret >= linebuflen - lx) + goto overflow1; + lx += ret; + } + } else { + for (j = 0; j < len; j++) { + if (linebuflen < lx + 3) + goto overflow2; ch = ptr[j]; linebuf[lx++] = hex_asc_hi(ch); linebuf[lx++] = hex_asc_lo(ch); @@ -161,21 +177,28 @@ void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, } if (j) lx--; - - ascii_column = 3 * rowsize + 2; - break; } if (!ascii) goto nil; - while (lx < (linebuflen - 1) && lx < (ascii_column - 1)) + while (lx < ascii_column) { + if (linebuflen < lx + 2) + goto overflow2; linebuf[lx++] = ' '; - for (j = 0; (j < len) && (lx + 2) < linebuflen; j++) { + } + for (j = 0; j < len; j++) { + if (linebuflen < lx + 2) + goto overflow2; ch = ptr[j]; linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.'; } nil: + linebuf[lx] = '\0'; + return lx; +overflow2: linebuf[lx++] = '\0'; +overflow1: + return ascii ? ascii_column + len : (groupsize * 2 + 1) * ngroups - 1; } EXPORT_SYMBOL(hex_dump_to_buffer); diff --git a/lib/idr.c b/lib/idr.c index e654aebd5f80..5335c43adf46 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -30,7 +30,6 @@ #include <linux/idr.h> #include <linux/spinlock.h> #include <linux/percpu.h> -#include <linux/hardirq.h> #define MAX_IDR_SHIFT (sizeof(int) * 8 - 1) #define MAX_IDR_BIT (1U << MAX_IDR_SHIFT) diff --git a/lib/interval_tree.c b/lib/interval_tree.c index f367f9ad544c..c85f6600a5f8 100644 --- a/lib/interval_tree.c +++ b/lib/interval_tree.c @@ -1,7 +1,7 @@ -#include <linux/init.h> #include <linux/interval_tree.h> #include <linux/interval_tree_generic.h> -#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/export.h> #define START(node) ((node)->start) #define LAST(node) ((node)->last) diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index 9ebf9e20de53..f6c2c1e7779c 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -20,7 +20,6 @@ #include <linux/export.h> #include <linux/kmod.h> #include <linux/slab.h> -#include <linux/user_namespace.h> #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/netlink.h> diff --git a/lib/lcm.c b/lib/lcm.c index 51cc6b13cd52..e97dbd51e756 100644 --- a/lib/lcm.c +++ b/lib/lcm.c @@ -1,4 +1,4 @@ -#include <linux/kernel.h> +#include <linux/compiler.h> #include <linux/gcd.h> #include <linux/export.h> #include <linux/lcm.h> diff --git a/lib/list_sort.c b/lib/list_sort.c index 12bcba1c8612..b29015102698 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -2,9 +2,11 @@ #define pr_fmt(fmt) "list_sort_test: " fmt #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/bug.h> +#include <linux/compiler.h> +#include <linux/export.h> +#include <linux/string.h> #include <linux/list_sort.h> -#include <linux/slab.h> #include <linux/list.h> #define MAX_LIST_LENGTH_BITS 20 @@ -146,6 +148,7 @@ EXPORT_SYMBOL(list_sort); #ifdef CONFIG_TEST_LIST_SORT +#include <linux/slab.h> #include <linux/random.h> /* diff --git a/lib/llist.c b/lib/llist.c index f76196d07409..0b0e9779d675 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -24,7 +24,6 @@ */ #include <linux/kernel.h> #include <linux/export.h> -#include <linux/interrupt.h> #include <linux/llist.h> diff --git a/lib/md5.c b/lib/md5.c index 958a3c15923c..bb0cd01d356d 100644 --- a/lib/md5.c +++ b/lib/md5.c @@ -1,4 +1,4 @@ -#include <linux/kernel.h> +#include <linux/compiler.h> #include <linux/export.h> #include <linux/cryptohash.h> diff --git a/lib/nlattr.c b/lib/nlattr.c index 9c3e85ff0a6c..76a1b59523ab 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -9,7 +9,6 @@ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/jiffies.h> -#include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index 93d145e5539c..f75715131f20 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -19,13 +19,10 @@ #include <linux/bug.h> #include <linux/err.h> #include <linux/export.h> -#include <linux/hardirq.h> -#include <linux/idr.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/sched.h> -#include <linux/slab.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/percpu_ida.h> diff --git a/lib/plist.c b/lib/plist.c index d408e774b746..3a30c53db061 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -25,7 +25,6 @@ #include <linux/bug.h> #include <linux/plist.h> -#include <linux/spinlock.h> #ifdef CONFIG_DEBUG_PI_LIST diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 3291a8e37490..3d2aa27b845b 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -33,7 +33,7 @@ #include <linux/string.h> #include <linux/bitops.h> #include <linux/rcupdate.h> -#include <linux/hardirq.h> /* in_interrupt() */ +#include <linux/preempt_mask.h> /* in_interrupt() */ /* diff --git a/lib/show_mem.c b/lib/show_mem.c index 7de89f4a36cf..adc98e1825ba 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -6,7 +6,6 @@ */ #include <linux/mm.h> -#include <linux/nmi.h> #include <linux/quicklist.h> #include <linux/cma.h> diff --git a/lib/sort.c b/lib/sort.c index 926d00429ed2..43c9fe73ae2e 100644 --- a/lib/sort.c +++ b/lib/sort.c @@ -4,10 +4,9 @@ * Jan 23 2005 Matt Mackall <mpm@selenic.com> */ -#include <linux/kernel.h> -#include <linux/module.h> +#include <linux/types.h> +#include <linux/export.h> #include <linux/sort.h> -#include <linux/slab.h> static void u32_swap(void *a, void *b, int size) { @@ -85,6 +84,7 @@ void sort(void *base, size_t num, size_t size, EXPORT_SYMBOL(sort); #if 0 +#include <linux/slab.h> /* a simple boot-time regression test */ int cmpint(const void *a, const void *b) diff --git a/lib/stmp_device.c b/lib/stmp_device.c index 8ac9bcc4289a..a904656f4fd7 100644 --- a/lib/stmp_device.c +++ b/lib/stmp_device.c @@ -15,7 +15,8 @@ #include <linux/io.h> #include <linux/errno.h> #include <linux/delay.h> -#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/export.h> #include <linux/stmp_device.h> #define STMP_MODULE_CLKGATE (1 << 30) diff --git a/lib/string.c b/lib/string.c index 10063300b830..3206d0178296 100644 --- a/lib/string.c +++ b/lib/string.c @@ -58,14 +58,6 @@ int strncasecmp(const char *s1, const char *s2, size_t len) } EXPORT_SYMBOL(strncasecmp); #endif -#ifndef __HAVE_ARCH_STRNICMP -#undef strnicmp -int strnicmp(const char *s1, const char *s2, size_t len) -{ - return strncasecmp(s1, s2, len); -} -EXPORT_SYMBOL(strnicmp); -#endif #ifndef __HAVE_ARCH_STRCASECMP int strcasecmp(const char *s1, const char *s2) diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 58b78ba57439..8f8c4417f228 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -20,19 +20,18 @@ * @len: length of buffer * * This function returns a string formatted to 3 significant figures - * giving the size in the required units. Returns 0 on success or - * error on failure. @buf is always zero terminated. + * giving the size in the required units. @buf should have room for + * at least 9 bytes and will always be zero terminated. * */ -int string_get_size(u64 size, const enum string_size_units units, - char *buf, int len) +void string_get_size(u64 size, const enum string_size_units units, + char *buf, int len) { static const char *const units_10[] = { - "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", NULL + "B", "kB", "MB", "GB", "TB", "PB", "EB" }; static const char *const units_2[] = { - "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB", - NULL + "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB" }; static const char *const *const units_str[] = { [STRING_UNITS_10] = units_10, @@ -43,13 +42,13 @@ int string_get_size(u64 size, const enum string_size_units units, [STRING_UNITS_2] = 1024, }; int i, j; - u64 remainder = 0, sf_cap; + u32 remainder = 0, sf_cap; char tmp[8]; tmp[0] = '\0'; i = 0; if (size >= divisor[units]) { - while (size >= divisor[units] && units_str[units][i]) { + while (size >= divisor[units]) { remainder = do_div(size, divisor[units]); i++; } @@ -60,17 +59,14 @@ int string_get_size(u64 size, const enum string_size_units units, if (j) { remainder *= 1000; - do_div(remainder, divisor[units]); - snprintf(tmp, sizeof(tmp), ".%03lld", - (unsigned long long)remainder); + remainder /= divisor[units]; + snprintf(tmp, sizeof(tmp), ".%03u", remainder); tmp[j+1] = '\0'; } } - snprintf(buf, len, "%lld%s %s", (unsigned long long)size, + snprintf(buf, len, "%u%s %s", (u32)size, tmp, units_str[units][i]); - - return 0; } EXPORT_SYMBOL(string_get_size); diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index bb2b201d6ad0..e0af6ff73d14 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -1,4 +1,5 @@ -#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/export.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/errno.h> diff --git a/lib/test-hexdump.c b/lib/test-hexdump.c new file mode 100644 index 000000000000..daf29a390a89 --- /dev/null +++ b/lib/test-hexdump.c @@ -0,0 +1,180 @@ +/* + * Test cases for lib/hexdump.c module. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/random.h> +#include <linux/string.h> + +static const unsigned char data_b[] = { + '\xbe', '\x32', '\xdb', '\x7b', '\x0a', '\x18', '\x93', '\xb2', /* 00 - 07 */ + '\x70', '\xba', '\xc4', '\x24', '\x7d', '\x83', '\x34', '\x9b', /* 08 - 0f */ + '\xa6', '\x9c', '\x31', '\xad', '\x9c', '\x0f', '\xac', '\xe9', /* 10 - 17 */ + '\x4c', '\xd1', '\x19', '\x99', '\x43', '\xb1', '\xaf', '\x0c', /* 18 - 1f */ +}; + +static const unsigned char data_a[] = ".2.{....p..$}.4...1.....L...C..."; + +static const char *test_data_1_le[] __initconst = { + "be", "32", "db", "7b", "0a", "18", "93", "b2", + "70", "ba", "c4", "24", "7d", "83", "34", "9b", + "a6", "9c", "31", "ad", "9c", "0f", "ac", "e9", + "4c", "d1", "19", "99", "43", "b1", "af", "0c", +}; + +static const char *test_data_2_le[] __initconst = { + "32be", "7bdb", "180a", "b293", + "ba70", "24c4", "837d", "9b34", + "9ca6", "ad31", "0f9c", "e9ac", + "d14c", "9919", "b143", "0caf", +}; + +static const char *test_data_4_le[] __initconst = { + "7bdb32be", "b293180a", "24c4ba70", "9b34837d", + "ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143", +}; + +static const char *test_data_8_le[] __initconst = { + "b293180a7bdb32be", "9b34837d24c4ba70", + "e9ac0f9cad319ca6", "0cafb1439919d14c", +}; + +static void __init test_hexdump(size_t len, int rowsize, int groupsize, + bool ascii) +{ + char test[32 * 3 + 2 + 32 + 1]; + char real[32 * 3 + 2 + 32 + 1]; + char *p; + const char **result; + size_t l = len; + int gs = groupsize, rs = rowsize; + unsigned int i; + + hex_dump_to_buffer(data_b, l, rs, gs, real, sizeof(real), ascii); + + if (rs != 16 && rs != 32) + rs = 16; + + if (l > rs) + l = rs; + + if (!is_power_of_2(gs) || gs > 8 || (len % gs != 0)) + gs = 1; + + if (gs == 8) + result = test_data_8_le; + else if (gs == 4) + result = test_data_4_le; + else if (gs == 2) + result = test_data_2_le; + else + result = test_data_1_le; + + memset(test, ' ', sizeof(test)); + + /* hex dump */ + p = test; + for (i = 0; i < l / gs; i++) { + const char *q = *result++; + size_t amount = strlen(q); + + strncpy(p, q, amount); + p += amount + 1; + } + if (i) + p--; + + /* ASCII part */ + if (ascii) { + p = test + rs * 2 + rs / gs + 1; + strncpy(p, data_a, l); + p += l; + } + + *p = '\0'; + + if (strcmp(test, real)) { + pr_err("Len: %zu row: %d group: %d\n", len, rowsize, groupsize); + pr_err("Result: '%s'\n", real); + pr_err("Expect: '%s'\n", test); + } +} + +static void __init test_hexdump_set(int rowsize, bool ascii) +{ + size_t d = min_t(size_t, sizeof(data_b), rowsize); + size_t len = get_random_int() % d + 1; + + test_hexdump(len, rowsize, 4, ascii); + test_hexdump(len, rowsize, 2, ascii); + test_hexdump(len, rowsize, 8, ascii); + test_hexdump(len, rowsize, 1, ascii); +} + +static void __init test_hexdump_overflow(bool ascii) +{ + char buf[56]; + const char *t = test_data_1_le[0]; + size_t l = get_random_int() % sizeof(buf); + bool a; + int e, r; + + memset(buf, ' ', sizeof(buf)); + + r = hex_dump_to_buffer(data_b, 1, 16, 1, buf, l, ascii); + + if (ascii) + e = 50; + else + e = 2; + buf[e + 2] = '\0'; + + if (!l) { + a = r == e && buf[0] == ' '; + } else if (l < 3) { + a = r == e && buf[0] == '\0'; + } else if (l < 4) { + a = r == e && !strcmp(buf, t); + } else if (ascii) { + if (l < 51) + a = r == e && buf[l - 1] == '\0' && buf[l - 2] == ' '; + else + a = r == e && buf[50] == '\0' && buf[49] == '.'; + } else { + a = r == e && buf[e] == '\0'; + } + + if (!a) { + pr_err("Len: %zu rc: %u strlen: %zu\n", l, r, strlen(buf)); + pr_err("Result: '%s'\n", buf); + } +} + +static int __init test_hexdump_init(void) +{ + unsigned int i; + int rowsize; + + pr_info("Running tests...\n"); + + rowsize = (get_random_int() % 2 + 1) * 16; + for (i = 0; i < 16; i++) + test_hexdump_set(rowsize, false); + + rowsize = (get_random_int() % 2 + 1) * 16; + for (i = 0; i < 16; i++) + test_hexdump_set(rowsize, true); + + for (i = 0; i < 16; i++) + test_hexdump_overflow(false); + + for (i = 0; i < 16; i++) + test_hexdump_overflow(true); + + return -EINVAL; +} +module_init(test_hexdump_init); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index ec337f64f52d..602d2081e713 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -114,8 +114,9 @@ int skip_atoi(const char **s) { int i = 0; - while (isdigit(**s)) + do { i = i*10 + *((*s)++) - '0'; + } while (isdigit(**s)); return i; } @@ -1604,8 +1605,7 @@ qualifier: case 'p': spec->type = FORMAT_TYPE_PTR; - return fmt - start; - /* skip alnum */ + return ++fmt - start; case '%': spec->type = FORMAT_TYPE_PERCENT_CHAR; @@ -1728,7 +1728,7 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) /* Reject out-of-range values early. Large positive sizes are used for unknown buffer sizes. */ - if (WARN_ON_ONCE((int) size < 0)) + if (WARN_ON_ONCE(size > INT_MAX)) return 0; str = buf; @@ -1794,7 +1794,7 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) break; case FORMAT_TYPE_PTR: - str = pointer(fmt+1, str, end, va_arg(args, void *), + str = pointer(fmt, str, end, va_arg(args, void *), spec); while (isalnum(*fmt)) fmt++; @@ -2232,7 +2232,7 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) } case FORMAT_TYPE_PTR: - str = pointer(fmt+1, str, end, get_arg(void *), spec); + str = pointer(fmt, str, end, get_arg(void *), spec); while (isalnum(*fmt)) fmt++; break; diff --git a/mm/Kconfig b/mm/Kconfig index 4395b12869c8..de5239c152f9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -602,6 +602,16 @@ config PGTABLE_MAPPING You can check speed with zsmalloc benchmark: https://github.com/spartacus06/zsmapbench +config ZSMALLOC_STAT + bool "Export zsmalloc statistics" + depends on ZSMALLOC + select DEBUG_FS + help + This option enables code in the zsmalloc to collect various + statistics about whats happening in zsmalloc and exports that + information to userspace via debugfs. + If unsure, say N. + config GENERIC_EARLY_IOREMAP bool diff --git a/mm/compaction.c b/mm/compaction.c index b68736c8a1ce..d50d6de6f1b6 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -490,6 +490,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* If a page was split, advance to the end of it */ if (isolated) { + cc->nr_freepages += isolated; + if (!strict && + cc->nr_migratepages <= cc->nr_freepages) { + blockpfn += isolated; + break; + } + blockpfn += isolated - 1; cursor += isolated - 1; continue; @@ -899,7 +906,6 @@ static void isolate_freepages(struct compact_control *cc) unsigned long isolate_start_pfn; /* exact pfn we start at */ unsigned long block_end_pfn; /* end of current pageblock */ unsigned long low_pfn; /* lowest pfn scanner is able to scan */ - int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; /* @@ -924,11 +930,11 @@ static void isolate_freepages(struct compact_control *cc) * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; + for (; block_start_pfn >= low_pfn && + cc->nr_migratepages > cc->nr_freepages; block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { - unsigned long isolated; /* * This can iterate a massively long zone without finding any @@ -953,9 +959,8 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. */ - isolated = isolate_freepages_block(cc, &isolate_start_pfn, + isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, freelist, false); - nr_freepages += isolated; /* * Remember where the free scanner should restart next time, @@ -987,8 +992,6 @@ static void isolate_freepages(struct compact_control *cc) */ if (block_start_pfn < low_pfn) cc->free_pfn = cc->migrate_pfn; - - cc->nr_freepages = nr_freepages; } /* @@ -1100,8 +1103,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, isolate_mode); - if (!low_pfn || cc->contended) + if (!low_pfn || cc->contended) { + acct_isolated(zone, cc); return ISOLATE_ABORT; + } /* * Either we isolated something and proceed with migration. Or @@ -1173,7 +1178,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_PARTIAL; /* Job done if allocation would set block type */ - if (cc->order >= pageblock_order && area->nr_free) + if (order >= pageblock_order && area->nr_free) return COMPACT_PARTIAL; } @@ -64,7 +64,7 @@ retry: migration_entry_wait(mm, pmd, address); goto retry; } - if ((flags & FOLL_NUMA) && pte_numa(pte)) + if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) { pte_unmap_unlock(ptep, ptl); @@ -184,7 +184,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); if (pmd_trans_huge(*pmd)) { if (flags & FOLL_SPLIT) { @@ -906,10 +906,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, /* * Similar to the PMD case below, NUMA hinting must take slow - * path + * path using the pte_protnone check. */ if (!pte_present(pte) || pte_special(pte) || - pte_numa(pte) || (write && !pte_write(pte))) + pte_protnone(pte) || (write && !pte_write(pte))) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -1104,7 +1104,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, * slowpath for accounting purposes and so that they * can be serialised against THP migration. */ - if (pmd_numa(pmd)) + if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, pmdp, addr, next, write, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cb7be110cad3..fc00c8cb5a82 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1211,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, return ERR_PTR(-EFAULT); /* Full NUMA hinting faults to serialise migration in fault paths */ - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) goto out; page = pmd_page(*pmd); @@ -1262,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, bool migrated = false; int flags = 0; + /* A PROT_NONE fault should not end up here */ + BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); + ptl = pmd_lock(mm, pmdp); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; @@ -1272,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * check_same as the page may no longer be mapped. */ if (unlikely(pmd_trans_migrating(*pmdp))) { + page = pmd_page(*pmdp); spin_unlock(ptl); - wait_migrate_huge_page(vma->anon_vma, pmdp); + wait_on_page_locked(page); goto out; } @@ -1341,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Migrate the THP to the requested node, returns with page unlocked - * and pmd_numa cleared. + * and access rights restored. */ spin_unlock(ptl); migrated = migrate_misplaced_transhuge_page(mm, vma, @@ -1354,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; clear_pmdnuma: BUG_ON(!PageLocked(page)); - pmd = pmd_mknonnuma(pmd); + pmd = pmd_modify(pmd, vma->vm_page_prot); set_pmd_at(mm, haddr, pmdp, pmd); - VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); out_unlock: @@ -1479,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; - ret = 1; - if (!prot_numa) { + + /* + * Avoid trapping faults against the zero page. The read-only + * data is likely to be read-cached on the local CPU and + * local/remote hits to the zero page are not interesting. + */ + if (prot_numa && is_huge_zero_pmd(*pmd)) { + spin_unlock(ptl); + return 0; + } + + if (!prot_numa || !pmd_protnone(*pmd)) { + ret = 1; entry = pmdp_get_and_clear_notify(mm, addr, pmd); - if (pmd_numa(entry)) - entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); BUG_ON(pmd_write(entry)); - } else { - struct page *page = pmd_page(*pmd); - - /* - * Do not trap faults against the zero page. The - * read-only data is likely to be read-cached on the - * local CPU cache and it is less useful to know about - * local vs remote hits on the zero page. - */ - if (!is_huge_zero_page(page) && - !pmd_numa(*pmd)) { - pmdp_set_numa(mm, addr, pmd); - ret = HPAGE_PMD_NR; - } } spin_unlock(ptl); } @@ -1766,9 +1764,9 @@ static int __split_huge_page_map(struct page *page, pte_t *pte, entry; BUG_ON(PageCompound(page+i)); /* - * Note that pmd_numa is not transferred deliberately - * to avoid any possibility that pte_numa leaks to - * a PROT_NONE VMA by accident. + * Note that NUMA hinting access restrictions are not + * transferred to avoid any possibility of altering + * permissions across VMAs. */ entry = mk_pte(page + i, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); diff --git a/mm/internal.h b/mm/internal.h index c4d6c9b43491..a96da5b0029d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -351,8 +351,10 @@ extern int mminit_loglevel; #define mminit_dprintk(level, prefix, fmt, arg...) \ do { \ if (level < mminit_loglevel) { \ - printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ - printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ + if (level <= MMINIT_WARNING) \ + printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \ + else \ + printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ } \ } while (0) diff --git a/mm/list_lru.c b/mm/list_lru.c index f1a0db194173..909eca2c820e 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -9,18 +9,100 @@ #include <linux/mm.h> #include <linux/list_lru.h> #include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/memcontrol.h> + +#ifdef CONFIG_MEMCG_KMEM +static LIST_HEAD(list_lrus); +static DEFINE_MUTEX(list_lrus_mutex); + +static void list_lru_register(struct list_lru *lru) +{ + mutex_lock(&list_lrus_mutex); + list_add(&lru->list, &list_lrus); + mutex_unlock(&list_lrus_mutex); +} + +static void list_lru_unregister(struct list_lru *lru) +{ + mutex_lock(&list_lrus_mutex); + list_del(&lru->list); + mutex_unlock(&list_lrus_mutex); +} +#else +static void list_lru_register(struct list_lru *lru) +{ +} + +static void list_lru_unregister(struct list_lru *lru) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +#ifdef CONFIG_MEMCG_KMEM +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return !!lru->node[0].memcg_lrus; +} + +static inline struct list_lru_one * +list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) +{ + /* + * The lock protects the array of per cgroup lists from relocation + * (see memcg_update_list_lru_node). + */ + lockdep_assert_held(&nlru->lock); + if (nlru->memcg_lrus && idx >= 0) + return nlru->memcg_lrus->lru[idx]; + + return &nlru->lru; +} + +static inline struct list_lru_one * +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +{ + struct mem_cgroup *memcg; + + if (!nlru->memcg_lrus) + return &nlru->lru; + + memcg = mem_cgroup_from_kmem(ptr); + if (!memcg) + return &nlru->lru; + + return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); +} +#else +static inline bool list_lru_memcg_aware(struct list_lru *lru) +{ + return false; +} + +static inline struct list_lru_one * +list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) +{ + return &nlru->lru; +} + +static inline struct list_lru_one * +list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) +{ + return &nlru->lru; +} +#endif /* CONFIG_MEMCG_KMEM */ bool list_lru_add(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; spin_lock(&nlru->lock); - WARN_ON_ONCE(nlru->nr_items < 0); + l = list_lru_from_kmem(nlru, item); if (list_empty(item)) { - list_add_tail(item, &nlru->list); - if (nlru->nr_items++ == 0) - node_set(nid, lru->active_nodes); + list_add_tail(item, &l->list); + l->nr_items++; spin_unlock(&nlru->lock); return true; } @@ -33,13 +115,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; spin_lock(&nlru->lock); + l = list_lru_from_kmem(nlru, item); if (!list_empty(item)) { list_del_init(item); - if (--nlru->nr_items == 0) - node_clear(nid, lru->active_nodes); - WARN_ON_ONCE(nlru->nr_items < 0); + l->nr_items--; spin_unlock(&nlru->lock); return true; } @@ -48,33 +130,72 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) } EXPORT_SYMBOL_GPL(list_lru_del); -unsigned long -list_lru_count_node(struct list_lru *lru, int nid) +void list_lru_isolate(struct list_lru_one *list, struct list_head *item) +{ + list_del_init(item); + list->nr_items--; +} +EXPORT_SYMBOL_GPL(list_lru_isolate); + +void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, + struct list_head *head) +{ + list_move(item, head); + list->nr_items--; +} +EXPORT_SYMBOL_GPL(list_lru_isolate_move); + +static unsigned long __list_lru_count_one(struct list_lru *lru, + int nid, int memcg_idx) { - unsigned long count = 0; struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; + unsigned long count; spin_lock(&nlru->lock); - WARN_ON_ONCE(nlru->nr_items < 0); - count += nlru->nr_items; + l = list_lru_from_memcg_idx(nlru, memcg_idx); + count = l->nr_items; spin_unlock(&nlru->lock); return count; } + +unsigned long list_lru_count_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg) +{ + return __list_lru_count_one(lru, nid, memcg_cache_id(memcg)); +} +EXPORT_SYMBOL_GPL(list_lru_count_one); + +unsigned long list_lru_count_node(struct list_lru *lru, int nid) +{ + long count = 0; + int memcg_idx; + + count += __list_lru_count_one(lru, nid, -1); + if (list_lru_memcg_aware(lru)) { + for_each_memcg_cache_index(memcg_idx) + count += __list_lru_count_one(lru, nid, memcg_idx); + } + return count; +} EXPORT_SYMBOL_GPL(list_lru_count_node); -unsigned long -list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, - void *cb_arg, unsigned long *nr_to_walk) +static unsigned long +__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) { - struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_node *nlru = &lru->node[nid]; + struct list_lru_one *l; struct list_head *item, *n; unsigned long isolated = 0; spin_lock(&nlru->lock); + l = list_lru_from_memcg_idx(nlru, memcg_idx); restart: - list_for_each_safe(item, n, &nlru->list) { + list_for_each_safe(item, n, &l->list) { enum lru_status ret; /* @@ -85,14 +206,11 @@ restart: break; --*nr_to_walk; - ret = isolate(item, &nlru->lock, cb_arg); + ret = isolate(item, l, &nlru->lock, cb_arg); switch (ret) { case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); case LRU_REMOVED: - if (--nlru->nr_items == 0) - node_clear(nid, lru->active_nodes); - WARN_ON_ONCE(nlru->nr_items < 0); isolated++; /* * If the lru lock has been dropped, our list @@ -103,7 +221,7 @@ restart: goto restart; break; case LRU_ROTATE: - list_move_tail(item, &nlru->list); + list_move_tail(item, &l->list); break; case LRU_SKIP: break; @@ -122,31 +240,322 @@ restart: spin_unlock(&nlru->lock); return isolated; } + +unsigned long +list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), + isolate, cb_arg, nr_to_walk); +} +EXPORT_SYMBOL_GPL(list_lru_walk_one); + +unsigned long list_lru_walk_node(struct list_lru *lru, int nid, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long *nr_to_walk) +{ + long isolated = 0; + int memcg_idx; + + isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg, + nr_to_walk); + if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { + for_each_memcg_cache_index(memcg_idx) { + isolated += __list_lru_walk_one(lru, nid, memcg_idx, + isolate, cb_arg, nr_to_walk); + if (*nr_to_walk <= 0) + break; + } + } + return isolated; +} EXPORT_SYMBOL_GPL(list_lru_walk_node); -int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) +static void init_one_lru(struct list_lru_one *l) +{ + INIT_LIST_HEAD(&l->list); + l->nr_items = 0; +} + +#ifdef CONFIG_MEMCG_KMEM +static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, + int begin, int end) +{ + int i; + + for (i = begin; i < end; i++) + kfree(memcg_lrus->lru[i]); +} + +static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus, + int begin, int end) +{ + int i; + + for (i = begin; i < end; i++) { + struct list_lru_one *l; + + l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL); + if (!l) + goto fail; + + init_one_lru(l); + memcg_lrus->lru[i] = l; + } + return 0; +fail: + __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1); + return -ENOMEM; +} + +static int memcg_init_list_lru_node(struct list_lru_node *nlru) +{ + int size = memcg_nr_cache_ids; + + nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL); + if (!nlru->memcg_lrus) + return -ENOMEM; + + if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { + kfree(nlru->memcg_lrus); + return -ENOMEM; + } + + return 0; +} + +static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) +{ + __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); + kfree(nlru->memcg_lrus); +} + +static int memcg_update_list_lru_node(struct list_lru_node *nlru, + int old_size, int new_size) +{ + struct list_lru_memcg *old, *new; + + BUG_ON(old_size > new_size); + + old = nlru->memcg_lrus; + new = kmalloc(new_size * sizeof(void *), GFP_KERNEL); + if (!new) + return -ENOMEM; + + if (__memcg_init_list_lru_node(new, old_size, new_size)) { + kfree(new); + return -ENOMEM; + } + + memcpy(new, old, old_size * sizeof(void *)); + + /* + * The lock guarantees that we won't race with a reader + * (see list_lru_from_memcg_idx). + * + * Since list_lru_{add,del} may be called under an IRQ-safe lock, + * we have to use IRQ-safe primitives here to avoid deadlock. + */ + spin_lock_irq(&nlru->lock); + nlru->memcg_lrus = new; + spin_unlock_irq(&nlru->lock); + + kfree(old); + return 0; +} + +static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, + int old_size, int new_size) +{ + /* do not bother shrinking the array back to the old size, because we + * cannot handle allocation failures here */ + __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); +} + +static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) +{ + int i; + + for (i = 0; i < nr_node_ids; i++) { + if (!memcg_aware) + lru->node[i].memcg_lrus = NULL; + else if (memcg_init_list_lru_node(&lru->node[i])) + goto fail; + } + return 0; +fail: + for (i = i - 1; i >= 0; i--) + memcg_destroy_list_lru_node(&lru->node[i]); + return -ENOMEM; +} + +static void memcg_destroy_list_lru(struct list_lru *lru) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_destroy_list_lru_node(&lru->node[i]); +} + +static int memcg_update_list_lru(struct list_lru *lru, + int old_size, int new_size) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return 0; + + for (i = 0; i < nr_node_ids; i++) { + if (memcg_update_list_lru_node(&lru->node[i], + old_size, new_size)) + goto fail; + } + return 0; +fail: + for (i = i - 1; i >= 0; i--) + memcg_cancel_update_list_lru_node(&lru->node[i], + old_size, new_size); + return -ENOMEM; +} + +static void memcg_cancel_update_list_lru(struct list_lru *lru, + int old_size, int new_size) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_cancel_update_list_lru_node(&lru->node[i], + old_size, new_size); +} + +int memcg_update_all_list_lrus(int new_size) +{ + int ret = 0; + struct list_lru *lru; + int old_size = memcg_nr_cache_ids; + + mutex_lock(&list_lrus_mutex); + list_for_each_entry(lru, &list_lrus, list) { + ret = memcg_update_list_lru(lru, old_size, new_size); + if (ret) + goto fail; + } +out: + mutex_unlock(&list_lrus_mutex); + return ret; +fail: + list_for_each_entry_continue_reverse(lru, &list_lrus, list) + memcg_cancel_update_list_lru(lru, old_size, new_size); + goto out; +} + +static void memcg_drain_list_lru_node(struct list_lru_node *nlru, + int src_idx, int dst_idx) +{ + struct list_lru_one *src, *dst; + + /* + * Since list_lru_{add,del} may be called under an IRQ-safe lock, + * we have to use IRQ-safe primitives here to avoid deadlock. + */ + spin_lock_irq(&nlru->lock); + + src = list_lru_from_memcg_idx(nlru, src_idx); + dst = list_lru_from_memcg_idx(nlru, dst_idx); + + list_splice_init(&src->list, &dst->list); + dst->nr_items += src->nr_items; + src->nr_items = 0; + + spin_unlock_irq(&nlru->lock); +} + +static void memcg_drain_list_lru(struct list_lru *lru, + int src_idx, int dst_idx) +{ + int i; + + if (!list_lru_memcg_aware(lru)) + return; + + for (i = 0; i < nr_node_ids; i++) + memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); +} + +void memcg_drain_all_list_lrus(int src_idx, int dst_idx) +{ + struct list_lru *lru; + + mutex_lock(&list_lrus_mutex); + list_for_each_entry(lru, &list_lrus, list) + memcg_drain_list_lru(lru, src_idx, dst_idx); + mutex_unlock(&list_lrus_mutex); +} +#else +static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) +{ + return 0; +} + +static void memcg_destroy_list_lru(struct list_lru *lru) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ + +int __list_lru_init(struct list_lru *lru, bool memcg_aware, + struct lock_class_key *key) { int i; size_t size = sizeof(*lru->node) * nr_node_ids; + int err = -ENOMEM; + + memcg_get_cache_ids(); lru->node = kzalloc(size, GFP_KERNEL); if (!lru->node) - return -ENOMEM; + goto out; - nodes_clear(lru->active_nodes); for (i = 0; i < nr_node_ids; i++) { spin_lock_init(&lru->node[i].lock); if (key) lockdep_set_class(&lru->node[i].lock, key); - INIT_LIST_HEAD(&lru->node[i].list); - lru->node[i].nr_items = 0; + init_one_lru(&lru->node[i].lru); } - return 0; + + err = memcg_init_list_lru(lru, memcg_aware); + if (err) { + kfree(lru->node); + goto out; + } + + list_lru_register(lru); +out: + memcg_put_cache_ids(); + return err; } -EXPORT_SYMBOL_GPL(list_lru_init_key); +EXPORT_SYMBOL_GPL(__list_lru_init); void list_lru_destroy(struct list_lru *lru) { + /* Already destroyed or not yet initialized? */ + if (!lru->node) + return; + + memcg_get_cache_ids(); + + list_lru_unregister(lru); + + memcg_destroy_list_lru(lru); kfree(lru->node); + lru->node = NULL; + + memcg_put_cache_ids(); } EXPORT_SYMBOL_GPL(list_lru_destroy); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 095c1f96fbec..d18d3a6e7337 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -332,8 +332,10 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* Index in the kmem_cache->memcg_params->memcg_caches array */ + /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; + bool kmem_acct_activated; + bool kmem_acct_active; #endif int last_scanned_node; @@ -352,9 +354,9 @@ struct mem_cgroup { }; #ifdef CONFIG_MEMCG_KMEM -static bool memcg_kmem_is_active(struct mem_cgroup *memcg) +bool memcg_kmem_is_active(struct mem_cgroup *memcg) { - return memcg->kmemcg_id >= 0; + return memcg->kmem_acct_active; } #endif @@ -517,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(tcp_proto_cgroup); -static void disarm_sock_keys(struct mem_cgroup *memcg) -{ - if (!memcg_proto_activated(&memcg->tcp_mem)) - return; - static_key_slow_dec(&memcg_socket_limit_enabled); -} -#else -static void disarm_sock_keys(struct mem_cgroup *memcg) -{ -} #endif #ifdef CONFIG_MEMCG_KMEM /* - * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. + * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. * The main reason for not using cgroup id for this: * this works better in sparse environments, where we have a lot of memcgs, * but only a few kmem-limited. Or also, if we have, for instance, 200 * memcgs, and none but the 200th is kmem-limited, we'd have to have a * 200 entry array for that. * - * The current size of the caches array is stored in - * memcg_limited_groups_array_size. It will double each time we have to - * increase it. + * The current size of the caches array is stored in memcg_nr_cache_ids. It + * will double each time we have to increase it. */ -static DEFINE_IDA(kmem_limited_groups); -int memcg_limited_groups_array_size; +static DEFINE_IDA(memcg_cache_ida); +int memcg_nr_cache_ids; + +/* Protects memcg_nr_cache_ids */ +static DECLARE_RWSEM(memcg_cache_ids_sem); + +void memcg_get_cache_ids(void) +{ + down_read(&memcg_cache_ids_sem); +} + +void memcg_put_cache_ids(void) +{ + up_read(&memcg_cache_ids_sem); +} /* * MIN_SIZE is different than 1, because we would like to avoid going through @@ -569,32 +573,8 @@ int memcg_limited_groups_array_size; struct static_key memcg_kmem_enabled_key; EXPORT_SYMBOL(memcg_kmem_enabled_key); -static void memcg_free_cache_id(int id); - -static void disarm_kmem_keys(struct mem_cgroup *memcg) -{ - if (memcg_kmem_is_active(memcg)) { - static_key_slow_dec(&memcg_kmem_enabled_key); - memcg_free_cache_id(memcg->kmemcg_id); - } - /* - * This check can't live in kmem destruction function, - * since the charges will outlive the cgroup - */ - WARN_ON(page_counter_read(&memcg->kmem)); -} -#else -static void disarm_kmem_keys(struct mem_cgroup *memcg) -{ -} #endif /* CONFIG_MEMCG_KMEM */ -static void disarm_static_keys(struct mem_cgroup *memcg) -{ - disarm_sock_keys(memcg); - disarm_kmem_keys(memcg); -} - static struct mem_cgroup_per_zone * mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) { @@ -2538,18 +2518,19 @@ static int memcg_alloc_cache_id(void) int id, size; int err; - id = ida_simple_get(&kmem_limited_groups, + id = ida_simple_get(&memcg_cache_ida, 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); if (id < 0) return id; - if (id < memcg_limited_groups_array_size) + if (id < memcg_nr_cache_ids) return id; /* * There's no space for the new id in memcg_caches arrays, * so we have to grow them. */ + down_write(&memcg_cache_ids_sem); size = 2 * (id + 1); if (size < MEMCG_CACHES_MIN_SIZE) @@ -2558,8 +2539,15 @@ static int memcg_alloc_cache_id(void) size = MEMCG_CACHES_MAX_SIZE; err = memcg_update_all_caches(size); + if (!err) + err = memcg_update_all_list_lrus(size); + if (!err) + memcg_nr_cache_ids = size; + + up_write(&memcg_cache_ids_sem); + if (err) { - ida_simple_remove(&kmem_limited_groups, id); + ida_simple_remove(&memcg_cache_ida, id); return err; } return id; @@ -2567,17 +2555,7 @@ static int memcg_alloc_cache_id(void) static void memcg_free_cache_id(int id) { - ida_simple_remove(&kmem_limited_groups, id); -} - -/* - * We should update the current array size iff all caches updates succeed. This - * can only be done from the slab side. The slab mutex needs to be held when - * calling this. - */ -void memcg_update_array_size(int num) -{ - memcg_limited_groups_array_size = num; + ida_simple_remove(&memcg_cache_ida, id); } struct memcg_kmem_cache_create_work { @@ -2656,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; + int kmemcg_id; - VM_BUG_ON(!cachep->memcg_params); - VM_BUG_ON(!cachep->memcg_params->is_root_cache); + VM_BUG_ON(!is_root_cache(cachep)); if (current->memcg_kmem_skip_account) return cachep; memcg = get_mem_cgroup_from_mm(current->mm); - if (!memcg_kmem_is_active(memcg)) + kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); + if (kmemcg_id < 0) goto out; - memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); + memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); if (likely(memcg_cachep)) return memcg_cachep; @@ -2692,7 +2671,7 @@ out: void __memcg_kmem_put_cache(struct kmem_cache *cachep) { if (!is_root_cache(cachep)) - css_put(&cachep->memcg_params->memcg->css); + css_put(&cachep->memcg_params.memcg->css); } /* @@ -2757,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) memcg_uncharge_kmem(memcg, 1 << order); page->mem_cgroup = NULL; } + +struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) +{ + struct mem_cgroup *memcg = NULL; + struct kmem_cache *cachep; + struct page *page; + + page = virt_to_head_page(ptr); + if (PageSlab(page)) { + cachep = page->slab_cache; + if (!is_root_cache(cachep)) + memcg = cachep->memcg_params.memcg; + } else + /* page allocated by alloc_kmem_pages */ + memcg = page->mem_cgroup; + + return memcg; +} #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -3291,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, int err = 0; int memcg_id; - if (memcg_kmem_is_active(memcg)) - return 0; + BUG_ON(memcg->kmemcg_id >= 0); + BUG_ON(memcg->kmem_acct_activated); + BUG_ON(memcg->kmem_acct_active); /* * For simplicity, we won't allow this to be disabled. It also can't @@ -3335,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, * patched. */ memcg->kmemcg_id = memcg_id; + memcg->kmem_acct_activated = true; + memcg->kmem_acct_active = true; out: return err; } @@ -4014,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return mem_cgroup_sockets_init(memcg, ss); } +static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +{ + struct cgroup_subsys_state *css; + struct mem_cgroup *parent, *child; + int kmemcg_id; + + if (!memcg->kmem_acct_active) + return; + + /* + * Clear the 'active' flag before clearing memcg_caches arrays entries. + * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it + * guarantees no cache will be created for this cgroup after we are + * done (see memcg_create_kmem_cache()). + */ + memcg->kmem_acct_active = false; + + memcg_deactivate_kmem_caches(memcg); + + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* + * Change kmemcg_id of this cgroup and all its descendants to the + * parent's id, and then move all entries from this cgroup's list_lrus + * to ones of the parent. After we have finished, all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. The + * ordering is imposed by list_lru_node->lock taken by + * memcg_drain_all_list_lrus(). + */ + css_for_each_descendant_pre(css, &memcg->css) { + child = mem_cgroup_from_css(css); + BUG_ON(child->kmemcg_id != kmemcg_id); + child->kmemcg_id = parent->kmemcg_id; + if (!memcg->use_hierarchy) + break; + } + memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); + + memcg_free_cache_id(kmemcg_id); +} + static void memcg_destroy_kmem(struct mem_cgroup *memcg) { - memcg_destroy_kmem_caches(memcg); + if (memcg->kmem_acct_activated) { + memcg_destroy_kmem_caches(memcg); + static_key_slow_dec(&memcg_kmem_enabled_key); + WARN_ON(page_counter_read(&memcg->kmem)); + } mem_cgroup_sockets_destroy(memcg); } #else @@ -4025,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return 0; } +static void memcg_deactivate_kmem(struct mem_cgroup *memcg) +{ +} + static void memcg_destroy_kmem(struct mem_cgroup *memcg) { } @@ -4443,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); - - disarm_static_keys(memcg); kfree(memcg); } @@ -4581,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) spin_unlock(&memcg->event_list_lock); vmpressure_cleanup(&memcg->vmpressure); + + memcg_deactivate_kmem(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index feb803bf3443..d487f8dc6d39 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -242,15 +242,8 @@ void shake_page(struct page *p, int access) * Only call shrink_node_slabs here (which would also shrink * other caches) if access is not potentially fatal. */ - if (access) { - int nr; - int nid = page_to_nid(p); - do { - nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); - if (page_count(p) == 1) - break; - } while (nr > 10); - } + if (access) + drop_slab_node(page_to_nid(p)); } EXPORT_SYMBOL_GPL(shake_page); @@ -1654,8 +1647,6 @@ static int __soft_offline_page(struct page *page, int flags) * setting PG_hwpoison. */ if (!is_free_buddy_page(page)) - lru_add_drain_all(); - if (!is_free_buddy_page(page)) drain_all_pages(page_zone(page)); SetPageHWPoison(page); if (!is_free_buddy_page(page)) diff --git a/mm/memory.c b/mm/memory.c index bbe6a73a899d..99275325f303 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3013,14 +3013,17 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, bool migrated = false; int flags = 0; + /* A PROT_NONE fault should not end up here */ + BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); + /* * The "pte" at this point cannot be used safely without * validation through pte_unmap_same(). It's of NUMA type but * the pfn may be screwed if the read is non atomic. * - * ptep_modify_prot_start is not called as this is clearing - * the _PAGE_NUMA bit and it is not really expected that there - * would be concurrent hardware modifications to the PTE. + * We can safely just do a "set_pte_at()", because the old + * page table entry is not accessible, so there would be no + * concurrent hardware modifications to the PTE. */ ptl = pte_lockptr(mm, pmd); spin_lock(ptl); @@ -3029,7 +3032,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } - pte = pte_mknonnuma(pte); + /* Make it present again */ + pte = pte_modify(pte, vma->vm_page_prot); + pte = pte_mkyoung(pte); set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); @@ -3038,7 +3043,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(ptep, ptl); return 0; } - BUG_ON(is_zero_pfn(page_to_pfn(page))); /* * Avoid grouping on DSO/COW pages in specific and RO pages @@ -3124,7 +3128,7 @@ static int handle_pte_fault(struct mm_struct *mm, pte, pmd, flags, entry); } - if (pte_numa(entry)) + if (pte_protnone(entry)) return do_numa_page(mm, vma, address, entry, pte, pmd); ptl = pte_lockptr(mm, pmd); @@ -3202,7 +3206,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (pmd_trans_splitting(orig_pmd)) return 0; - if (pmd_numa(orig_pmd)) + if (pmd_protnone(orig_pmd)) return do_huge_pmd_numa_page(mm, vma, address, orig_pmd, pmd); @@ -3458,7 +3462,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, if (follow_phys(vma, addr, write, &prot, &phys_addr)) return -EINVAL; - maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); + maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (write) memcpy_toio(maddr + offset, buf, len); else diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f1bd23803576..c75f4dcec808 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -569,7 +569,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, { int nr_updated; - nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); + nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); diff --git a/mm/migrate.c b/mm/migrate.c index f98067e5d353..85e042686031 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1654,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd) return PageLocked(page); } -void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) -{ - struct page *page = pmd_page(*pmd); - wait_on_page_locked(page); -} - /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on @@ -1853,7 +1847,7 @@ out_fail: out_dropref: ptl = pmd_lock(mm, pmd); if (pmd_same(*pmd, entry)) { - entry = pmd_mknonnuma(entry); + entry = pmd_modify(entry, vma->vm_page_prot); set_pmd_at(mm, mmun_start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); } diff --git a/mm/mm_init.c b/mm/mm_init.c index 4074caf9936b..5f420f7fafa1 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -14,14 +14,14 @@ #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT -int mminit_loglevel; +int __meminitdata mminit_loglevel; #ifndef SECTIONS_SHIFT #define SECTIONS_SHIFT 0 #endif /* The zonelists are simply reported, validation is manual. */ -void mminit_verify_zonelist(void) +void __init mminit_verify_zonelist(void) { int nid; diff --git a/mm/mprotect.c b/mm/mprotect.c index 33121662f08b..44727811bf4c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -75,36 +75,34 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; - bool updated = false; - if (!prot_numa) { - ptent = ptep_modify_prot_start(mm, addr, pte); - if (pte_numa(ptent)) - ptent = pte_mknonnuma(ptent); - ptent = pte_modify(ptent, newprot); - /* - * Avoid taking write faults for pages we - * know to be dirty. - */ - if (dirty_accountable && pte_dirty(ptent) && - (pte_soft_dirty(ptent) || - !(vma->vm_flags & VM_SOFTDIRTY))) - ptent = pte_mkwrite(ptent); - ptep_modify_prot_commit(mm, addr, pte, ptent); - updated = true; - } else { + /* + * Avoid trapping faults against the zero or KSM + * pages. See similar comment in change_huge_pmd. + */ + if (prot_numa) { struct page *page; page = vm_normal_page(vma, addr, oldpte); - if (page && !PageKsm(page)) { - if (!pte_numa(oldpte)) { - ptep_set_numa(mm, addr, pte); - updated = true; - } - } + if (!page || PageKsm(page)) + continue; + + /* Avoid TLB flush if possible */ + if (pte_protnone(oldpte)) + continue; } - if (updated) - pages++; + + ptent = ptep_modify_prot_start(mm, addr, pte); + ptent = pte_modify(ptent, newprot); + + /* Avoid taking write faults for known dirty pages */ + if (dirty_accountable && pte_dirty(ptent) && + (pte_soft_dirty(ptent) || + !(vma->vm_flags & VM_SOFTDIRTY))) { + ptent = pte_mkwrite(ptent); + } + ptep_modify_prot_commit(mm, addr, pte, ptent); + pages++; } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d52ab18fe0d..cb4758263f6b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -172,7 +172,7 @@ static void __free_pages_ok(struct page *page, unsigned int order); * 1G machine -> (16M dma, 784M normal, 224M high) * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL - * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA + * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA * * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation @@ -3871,18 +3871,29 @@ static int __build_all_zonelists(void *data) return 0; } +static noinline void __init +build_all_zonelists_init(void) +{ + __build_all_zonelists(NULL); + mminit_verify_zonelist(); + cpuset_init_current_mems_allowed(); +} + /* * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. + * + * __ref due to (1) call of __meminit annotated setup_zone_pageset + * [we're only called with non-NULL zone through __meminit paths] and + * (2) call of __init annotated helper build_all_zonelists_init + * [protected by SYSTEM_BOOTING]. */ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) { set_zonelist_order(); if (system_state == SYSTEM_BOOTING) { - __build_all_zonelists(NULL); - mminit_verify_zonelist(); - cpuset_init_current_mems_allowed(); + build_all_zonelists_init(); } else { #ifdef CONFIG_MEMORY_HOTPLUG if (zone) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index dfb79e028ecb..c25f94b33811 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t entry = *pmdp; - if (pmd_numa(entry)) - entry = pmd_mknonnuma(entry); set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } diff --git a/mm/slab.c b/mm/slab.c index 65b5dcb6f671..c4b89eaf4c96 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2382,7 +2382,7 @@ out: return nr_freed; } -int __kmem_cache_shrink(struct kmem_cache *cachep) +int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) { int ret = 0; int node; @@ -2404,7 +2404,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) { int i; struct kmem_cache_node *n; - int rc = __kmem_cache_shrink(cachep); + int rc = __kmem_cache_shrink(cachep, false); if (rc) return rc; @@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared, gfp_t gfp) { int ret; - struct kmem_cache *c = NULL; - int i = 0; + struct kmem_cache *c; ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); @@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, if ((ret < 0) || !is_root_cache(cachep)) return ret; - VM_BUG_ON(!mutex_is_locked(&slab_mutex)); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(cachep, i); - if (c) - /* return value determined by the parent cache only */ - __do_tune_cpucache(c, limit, batchcount, shared, gfp); + lockdep_assert_held(&slab_mutex); + for_each_memcg_cache(c, cachep) { + /* return value determined by the root cache only */ + __do_tune_cpucache(c, limit, batchcount, shared, gfp); } return ret; diff --git a/mm/slab.h b/mm/slab.h index 90430d6f665e..4c3ac12dd644 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, extern void create_boot_cache(struct kmem_cache *, const char *name, size_t size, unsigned long flags); -struct mem_cgroup; - int slab_unmergeable(struct kmem_cache *s); struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, const char *name, void (*ctor)(void *)); @@ -140,7 +138,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) int __kmem_cache_shutdown(struct kmem_cache *); -int __kmem_cache_shrink(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *, bool); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; @@ -165,16 +163,27 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); #ifdef CONFIG_MEMCG_KMEM +/* + * Iterate over all memcg caches of the given root cache. The caller must hold + * slab_mutex. + */ +#define for_each_memcg_cache(iter, root) \ + list_for_each_entry(iter, &(root)->memcg_params.list, \ + memcg_params.list) + +#define for_each_memcg_cache_safe(iter, tmp, root) \ + list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ + memcg_params.list) + static inline bool is_root_cache(struct kmem_cache *s) { - return !s->memcg_params || s->memcg_params->is_root_cache; + return s->memcg_params.is_root_cache; } static inline bool slab_equal_or_root(struct kmem_cache *s, - struct kmem_cache *p) + struct kmem_cache *p) { - return (p == s) || - (s->memcg_params && (p == s->memcg_params->root_cache)); + return p == s || p == s->memcg_params.root_cache; } /* @@ -185,37 +194,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s, static inline const char *cache_name(struct kmem_cache *s) { if (!is_root_cache(s)) - return s->memcg_params->root_cache->name; + s = s->memcg_params.root_cache; return s->name; } /* * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. - * That said the caller must assure the memcg's cache won't go away. Since once - * created a memcg's cache is destroyed only along with the root cache, it is - * true if we are going to allocate from the cache or hold a reference to the - * root cache by other means. Otherwise, we should hold either the slab_mutex - * or the memcg's slab_caches_mutex while calling this function and accessing - * the returned value. + * That said the caller must assure the memcg's cache won't go away by either + * taking a css reference to the owner cgroup, or holding the slab_mutex. */ static inline struct kmem_cache * cache_from_memcg_idx(struct kmem_cache *s, int idx) { struct kmem_cache *cachep; - struct memcg_cache_params *params; - - if (!s->memcg_params) - return NULL; + struct memcg_cache_array *arr; rcu_read_lock(); - params = rcu_dereference(s->memcg_params); + arr = rcu_dereference(s->memcg_params.memcg_caches); /* * Make sure we will access the up-to-date value. The code updating * memcg_caches issues a write barrier to match this (see - * memcg_register_cache()). + * memcg_create_kmem_cache()). */ - cachep = lockless_dereference(params->memcg_caches[idx]); + cachep = lockless_dereference(arr->entries[idx]); rcu_read_unlock(); return cachep; @@ -225,7 +227,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) { if (is_root_cache(s)) return s; - return s->memcg_params->root_cache; + return s->memcg_params.root_cache; } static __always_inline int memcg_charge_slab(struct kmem_cache *s, @@ -235,7 +237,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, return 0; if (is_root_cache(s)) return 0; - return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order); + return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); } static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) @@ -244,9 +246,18 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) return; if (is_root_cache(s)) return; - memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order); + memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); } -#else + +extern void slab_init_memcg_params(struct kmem_cache *); + +#else /* !CONFIG_MEMCG_KMEM */ + +#define for_each_memcg_cache(iter, root) \ + for ((void)(iter), (void)(root); 0; ) +#define for_each_memcg_cache_safe(iter, tmp, root) \ + for ((void)(iter), (void)(tmp), (void)(root); 0; ) + static inline bool is_root_cache(struct kmem_cache *s) { return true; @@ -282,7 +293,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) { } -#endif + +static inline void slab_init_memcg_params(struct kmem_cache *s) +{ +} +#endif /* CONFIG_MEMCG_KMEM */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { diff --git a/mm/slab_common.c b/mm/slab_common.c index 6e1e4cf65836..1a1cc89acaa3 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -106,62 +106,67 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) #endif #ifdef CONFIG_MEMCG_KMEM -static int memcg_alloc_cache_params(struct mem_cgroup *memcg, - struct kmem_cache *s, struct kmem_cache *root_cache) +void slab_init_memcg_params(struct kmem_cache *s) { - size_t size; + s->memcg_params.is_root_cache = true; + INIT_LIST_HEAD(&s->memcg_params.list); + RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); +} + +static int init_memcg_params(struct kmem_cache *s, + struct mem_cgroup *memcg, struct kmem_cache *root_cache) +{ + struct memcg_cache_array *arr; - if (!memcg_kmem_enabled()) + if (memcg) { + s->memcg_params.is_root_cache = false; + s->memcg_params.memcg = memcg; + s->memcg_params.root_cache = root_cache; return 0; + } - if (!memcg) { - size = offsetof(struct memcg_cache_params, memcg_caches); - size += memcg_limited_groups_array_size * sizeof(void *); - } else - size = sizeof(struct memcg_cache_params); + slab_init_memcg_params(s); - s->memcg_params = kzalloc(size, GFP_KERNEL); - if (!s->memcg_params) - return -ENOMEM; + if (!memcg_nr_cache_ids) + return 0; - if (memcg) { - s->memcg_params->memcg = memcg; - s->memcg_params->root_cache = root_cache; - } else - s->memcg_params->is_root_cache = true; + arr = kzalloc(sizeof(struct memcg_cache_array) + + memcg_nr_cache_ids * sizeof(void *), + GFP_KERNEL); + if (!arr) + return -ENOMEM; + RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr); return 0; } -static void memcg_free_cache_params(struct kmem_cache *s) +static void destroy_memcg_params(struct kmem_cache *s) { - kfree(s->memcg_params); + if (is_root_cache(s)) + kfree(rcu_access_pointer(s->memcg_params.memcg_caches)); } -static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) +static int update_memcg_params(struct kmem_cache *s, int new_array_size) { - int size; - struct memcg_cache_params *new_params, *cur_params; - - BUG_ON(!is_root_cache(s)); + struct memcg_cache_array *old, *new; - size = offsetof(struct memcg_cache_params, memcg_caches); - size += num_memcgs * sizeof(void *); + if (!is_root_cache(s)) + return 0; - new_params = kzalloc(size, GFP_KERNEL); - if (!new_params) + new = kzalloc(sizeof(struct memcg_cache_array) + + new_array_size * sizeof(void *), GFP_KERNEL); + if (!new) return -ENOMEM; - cur_params = s->memcg_params; - memcpy(new_params->memcg_caches, cur_params->memcg_caches, - memcg_limited_groups_array_size * sizeof(void *)); - - new_params->is_root_cache = true; - - rcu_assign_pointer(s->memcg_params, new_params); - if (cur_params) - kfree_rcu(cur_params, rcu_head); + old = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + if (old) + memcpy(new->entries, old->entries, + memcg_nr_cache_ids * sizeof(void *)); + rcu_assign_pointer(s->memcg_params.memcg_caches, new); + if (old) + kfree_rcu(old, rcu); return 0; } @@ -169,34 +174,28 @@ int memcg_update_all_caches(int num_memcgs) { struct kmem_cache *s; int ret = 0; - mutex_lock(&slab_mutex); + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - if (!is_root_cache(s)) - continue; - - ret = memcg_update_cache_params(s, num_memcgs); + ret = update_memcg_params(s, num_memcgs); /* * Instead of freeing the memory, we'll just leave the caches * up to this point in an updated state. */ if (ret) - goto out; + break; } - - memcg_update_array_size(num_memcgs); -out: mutex_unlock(&slab_mutex); return ret; } #else -static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, - struct kmem_cache *s, struct kmem_cache *root_cache) +static inline int init_memcg_params(struct kmem_cache *s, + struct mem_cgroup *memcg, struct kmem_cache *root_cache) { return 0; } -static inline void memcg_free_cache_params(struct kmem_cache *s) +static inline void destroy_memcg_params(struct kmem_cache *s) { } #endif /* CONFIG_MEMCG_KMEM */ @@ -314,7 +313,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, s->align = align; s->ctor = ctor; - err = memcg_alloc_cache_params(memcg, s, root_cache); + err = init_memcg_params(s, memcg, root_cache); if (err) goto out_free_cache; @@ -330,7 +329,7 @@ out: return s; out_free_cache: - memcg_free_cache_params(s); + destroy_memcg_params(s); kmem_cache_free(kmem_cache, s); goto out; } @@ -369,6 +368,7 @@ kmem_cache_create(const char *name, size_t size, size_t align, get_online_cpus(); get_online_mems(); + memcg_get_cache_ids(); mutex_lock(&slab_mutex); @@ -407,6 +407,7 @@ kmem_cache_create(const char *name, size_t size, size_t align, out_unlock: mutex_unlock(&slab_mutex); + memcg_put_cache_ids(); put_online_mems(); put_online_cpus(); @@ -439,13 +440,8 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s, *need_rcu_barrier = true; #ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) { - struct kmem_cache *root_cache = s->memcg_params->root_cache; - int memcg_id = memcg_cache_id(s->memcg_params->memcg); - - BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s); - root_cache->memcg_params->memcg_caches[memcg_id] = NULL; - } + if (!is_root_cache(s)) + list_del(&s->memcg_params.list); #endif list_move(&s->list, release); return 0; @@ -482,9 +478,11 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ - int memcg_id = memcg_cache_id(memcg); + struct cgroup_subsys_state *css = mem_cgroup_css(memcg); + struct memcg_cache_array *arr; struct kmem_cache *s = NULL; char *cache_name; + int idx; get_online_cpus(); get_online_mems(); @@ -492,17 +490,27 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); /* + * The memory cgroup could have been deactivated while the cache + * creation work was pending. + */ + if (!memcg_kmem_is_active(memcg)) + goto out_unlock; + + idx = memcg_cache_id(memcg); + arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + + /* * Since per-memcg caches are created asynchronously on first * allocation (see memcg_kmem_get_cache()), several threads can try to * create the same cache, but only one of them may succeed. */ - if (cache_from_memcg_idx(root_cache, memcg_id)) + if (arr->entries[idx]) goto out_unlock; - cgroup_name(mem_cgroup_css(memcg)->cgroup, - memcg_name_buf, sizeof(memcg_name_buf)); + cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), memcg_name_buf); + css->id, memcg_name_buf); if (!cache_name) goto out_unlock; @@ -520,13 +528,15 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; } + list_add(&s->memcg_params.list, &root_cache->memcg_params.list); + /* * Since readers won't lock (see cache_from_memcg_idx()), we need a * barrier here to ensure nobody will see the kmem_cache partially * initialized. */ smp_wmb(); - root_cache->memcg_params->memcg_caches[memcg_id] = s; + arr->entries[idx] = s; out_unlock: mutex_unlock(&slab_mutex); @@ -535,6 +545,37 @@ out_unlock: put_online_cpus(); } +void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) +{ + int idx; + struct memcg_cache_array *arr; + struct kmem_cache *s, *c; + + idx = memcg_cache_id(memcg); + + get_online_cpus(); + get_online_mems(); + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + if (!is_root_cache(s)) + continue; + + arr = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + c = arr->entries[idx]; + if (!c) + continue; + + __kmem_cache_shrink(c, true); + arr->entries[idx] = NULL; + } + mutex_unlock(&slab_mutex); + + put_online_mems(); + put_online_cpus(); +} + void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { LIST_HEAD(release); @@ -546,7 +587,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) mutex_lock(&slab_mutex); list_for_each_entry_safe(s, s2, &slab_caches, list) { - if (is_root_cache(s) || s->memcg_params->memcg != memcg) + if (is_root_cache(s) || s->memcg_params.memcg != memcg) continue; /* * The cgroup is about to be freed and therefore has no charges @@ -565,18 +606,20 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) void slab_kmem_cache_release(struct kmem_cache *s) { - memcg_free_cache_params(s); + destroy_memcg_params(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } void kmem_cache_destroy(struct kmem_cache *s) { - int i; + struct kmem_cache *c, *c2; LIST_HEAD(release); bool need_rcu_barrier = false; bool busy = false; + BUG_ON(!is_root_cache(s)); + get_online_cpus(); get_online_mems(); @@ -586,10 +629,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg_idx(s, i); - - if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + for_each_memcg_cache_safe(c, c2, s) { + if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) busy = true; } @@ -619,7 +660,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); - ret = __kmem_cache_shrink(cachep); + ret = __kmem_cache_shrink(cachep, false); put_online_mems(); put_online_cpus(); return ret; @@ -641,6 +682,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz s->name = name; s->size = s->object_size = size; s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); + + slab_init_memcg_params(s); + err = __kmem_cache_create(s, flags); if (err) @@ -920,16 +964,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) { struct kmem_cache *c; struct slabinfo sinfo; - int i; if (!is_root_cache(s)) return; - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; - + for_each_memcg_cache(c, s) { memset(&sinfo, 0, sizeof(sinfo)); get_slabinfo(c, &sinfo); @@ -981,7 +1020,7 @@ int memcg_slab_show(struct seq_file *m, void *p) if (p == slab_caches.next) print_slabinfo_header(m); - if (!is_root_cache(s) && s->memcg_params->memcg == memcg) + if (!is_root_cache(s) && s->memcg_params.memcg == memcg) cache_show(s, m); return 0; } diff --git a/mm/slob.c b/mm/slob.c index 96a86206a26b..94a7fede6d48 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -618,7 +618,7 @@ int __kmem_cache_shutdown(struct kmem_cache *c) return 0; } -int __kmem_cache_shrink(struct kmem_cache *d) +int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) { return 0; } diff --git a/mm/slub.c b/mm/slub.c index 8b8508adf9c2..06cdb1829dc9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2007,6 +2007,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) int pages; int pobjects; + preempt_disable(); do { pages = 0; pobjects = 0; @@ -2040,6 +2041,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + if (unlikely(!s->cpu_partial)) { + unsigned long flags; + + local_irq_save(flags); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + local_irq_restore(flags); + } + preempt_enable(); #endif } @@ -3358,69 +3367,92 @@ void kfree(const void *x) } EXPORT_SYMBOL(kfree); +#define SHRINK_PROMOTE_MAX 32 + /* - * kmem_cache_shrink removes empty slabs from the partial lists and sorts - * the remaining slabs by the number of items in use. The slabs with the - * most items in use come first. New allocations will then fill those up - * and thus they can be removed from the partial lists. + * kmem_cache_shrink discards empty slabs and promotes the slabs filled + * up most to the head of the partial lists. New allocations will then + * fill those up and thus they can be removed from the partial lists. * * The slabs with the least items are placed last. This results in them * being allocated from last increasing the chance that the last objects * are freed in them. */ -int __kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) { int node; int i; struct kmem_cache_node *n; struct page *page; struct page *t; - int objects = oo_objects(s->max); - struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); + struct list_head discard; + struct list_head promote[SHRINK_PROMOTE_MAX]; unsigned long flags; + int ret = 0; - if (!slabs_by_inuse) - return -ENOMEM; + if (deactivate) { + /* + * Disable empty slabs caching. Used to avoid pinning offline + * memory cgroups by kmem pages that can be freed. + */ + s->cpu_partial = 0; + s->min_partial = 0; + + /* + * s->cpu_partial is checked locklessly (see put_cpu_partial), + * so we have to make sure the change is visible. + */ + kick_all_cpus_sync(); + } flush_all(s); for_each_kmem_cache_node(s, node, n) { - if (!n->nr_partial) - continue; - - for (i = 0; i < objects; i++) - INIT_LIST_HEAD(slabs_by_inuse + i); + INIT_LIST_HEAD(&discard); + for (i = 0; i < SHRINK_PROMOTE_MAX; i++) + INIT_LIST_HEAD(promote + i); spin_lock_irqsave(&n->list_lock, flags); /* - * Build lists indexed by the items in use in each slab. + * Build lists of slabs to discard or promote. * * Note that concurrent frees may occur while we hold the * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - list_move(&page->lru, slabs_by_inuse + page->inuse); - if (!page->inuse) + int free = page->objects - page->inuse; + + /* Do not reread page->inuse */ + barrier(); + + /* We do not keep full slabs on the list */ + BUG_ON(free <= 0); + + if (free == page->objects) { + list_move(&page->lru, &discard); n->nr_partial--; + } else if (free <= SHRINK_PROMOTE_MAX) + list_move(&page->lru, promote + free - 1); } /* - * Rebuild the partial list with the slabs filled up most - * first and the least used slabs at the end. + * Promote the slabs filled up most to the head of the + * partial list. */ - for (i = objects - 1; i > 0; i--) - list_splice(slabs_by_inuse + i, n->partial.prev); + for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) + list_splice(promote + i, &n->partial); spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ - list_for_each_entry_safe(page, t, slabs_by_inuse, lru) + list_for_each_entry_safe(page, t, &discard, lru) discard_slab(s, page); + + if (slabs_node(s, node)) + ret = 1; } - kfree(slabs_by_inuse); - return 0; + return ret; } static int slab_mem_going_offline_callback(void *arg) @@ -3429,7 +3461,7 @@ static int slab_mem_going_offline_callback(void *arg) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s); + __kmem_cache_shrink(s, false); mutex_unlock(&slab_mutex); return 0; @@ -3577,6 +3609,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) p->slab_cache = s; #endif } + slab_init_memcg_params(s); list_add(&s->list, &slab_caches); return s; } @@ -3635,13 +3668,10 @@ struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s; + struct kmem_cache *s, *c; s = find_mergeable(size, align, flags, name, ctor); if (s) { - int i; - struct kmem_cache *c; - s->refcount++; /* @@ -3651,10 +3681,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; + for_each_memcg_cache(c, s) { c->object_size = s->object_size; c->inuse = max_t(int, c->inuse, ALIGN(size, sizeof(void *))); @@ -4691,12 +4718,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf) static ssize_t shrink_store(struct kmem_cache *s, const char *buf, size_t length) { - if (buf[0] == '1') { - int rc = kmem_cache_shrink(s); - - if (rc) - return rc; - } else + if (buf[0] == '1') + kmem_cache_shrink(s); + else return -EINVAL; return length; } @@ -4920,7 +4944,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, err = attribute->store(s, buf, len); #ifdef CONFIG_MEMCG_KMEM if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { - int i; + struct kmem_cache *c; mutex_lock(&slab_mutex); if (s->max_attr_size < len) @@ -4943,11 +4967,8 @@ static ssize_t slab_attr_store(struct kobject *kobj, * directly either failed or succeeded, in which case we loop * through the descendants with best-effort propagation. */ - for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg_idx(s, i); - if (c) - attribute->store(c, buf, len); - } + for_each_memcg_cache(c, s) + attribute->store(c, buf, len); mutex_unlock(&slab_mutex); } #endif @@ -4964,7 +4985,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) if (is_root_cache(s)) return; - root_cache = s->memcg_params->root_cache; + root_cache = s->memcg_params.root_cache; /* * This mean this cache had no attribute written. Therefore, no point @@ -5044,7 +5065,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s) { #ifdef CONFIG_MEMCG_KMEM if (!is_root_cache(s)) - return s->memcg_params->root_cache->memcg_kset; + return s->memcg_params.root_cache->memcg_kset; #endif return slab_kset; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 224dd298fdcd..5e8eadd71bac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -232,10 +232,10 @@ EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 -static unsigned long shrink_slabs(struct shrink_control *shrinkctl, - struct shrinker *shrinker, - unsigned long nr_scanned, - unsigned long nr_eligible) +static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, + struct shrinker *shrinker, + unsigned long nr_scanned, + unsigned long nr_eligible) { unsigned long freed = 0; unsigned long long delta; @@ -344,9 +344,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, } /** - * shrink_node_slabs - shrink slab caches of a given node + * shrink_slab - shrink slab caches * @gfp_mask: allocation context * @nid: node whose slab caches to target + * @memcg: memory cgroup whose slab caches to target * @nr_scanned: pressure numerator * @nr_eligible: pressure denominator * @@ -355,6 +356,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, * unaware shrinkers will receive a node id of 0 instead. * + * @memcg specifies the memory cgroup to target. If it is not NULL, + * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan + * objects from the memory cgroup specified. Otherwise all shrinkers + * are called, and memcg aware shrinkers are supposed to scan the + * global list then. + * * @nr_scanned and @nr_eligible form a ratio that indicate how much of * the available objects should be scanned. Page reclaim for example * passes the number of pages scanned and the number of pages on the @@ -365,13 +372,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, * * Returns the number of reclaimed slab objects. */ -unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, - unsigned long nr_scanned, - unsigned long nr_eligible) +static unsigned long shrink_slab(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, + unsigned long nr_scanned, + unsigned long nr_eligible) { struct shrinker *shrinker; unsigned long freed = 0; + if (memcg && !memcg_kmem_is_active(memcg)) + return 0; + if (nr_scanned == 0) nr_scanned = SWAP_CLUSTER_MAX; @@ -390,12 +401,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, + .memcg = memcg, }; + if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) + continue; + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) sc.nid = 0; - freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); + freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); } up_read(&shrinker_rwsem); @@ -404,6 +419,29 @@ out: return freed; } +void drop_slab_node(int nid) +{ + unsigned long freed; + + do { + struct mem_cgroup *memcg = NULL; + + freed = 0; + do { + freed += shrink_slab(GFP_KERNEL, nid, memcg, + 1000, 1000); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + } while (freed > 10); +} + +void drop_slab(void) +{ + int nid; + + for_each_online_node(nid) + drop_slab_node(nid); +} + static inline int is_page_cache_freeable(struct page *page) { /* @@ -2276,6 +2314,7 @@ static inline bool should_continue_reclaim(struct zone *zone, static bool shrink_zone(struct zone *zone, struct scan_control *sc, bool is_classzone) { + struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_reclaimed, nr_scanned; bool reclaimable = false; @@ -2294,6 +2333,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg = mem_cgroup_iter(root, NULL, &reclaim); do { unsigned long lru_pages; + unsigned long scanned; struct lruvec *lruvec; int swappiness; @@ -2305,10 +2345,16 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); + scanned = sc->nr_scanned; shrink_lruvec(lruvec, swappiness, sc, &lru_pages); zone_lru_pages += lru_pages; + if (memcg && is_classzone) + shrink_slab(sc->gfp_mask, zone_to_nid(zone), + memcg, sc->nr_scanned - scanned, + lru_pages); + /* * Direct reclaim and kswapd have to scan all memory * cgroups to fulfill the overall scan target for the @@ -2330,19 +2376,14 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, * Shrink the slab caches in the same proportion that * the eligible LRU pages were scanned. */ - if (global_reclaim(sc) && is_classzone) { - struct reclaim_state *reclaim_state; - - shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), - sc->nr_scanned - nr_scanned, - zone_lru_pages); - - reclaim_state = current->reclaim_state; - if (reclaim_state) { - sc->nr_reclaimed += - reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } + if (global_reclaim(sc) && is_classzone) + shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, + sc->nr_scanned - nr_scanned, + zone_lru_pages); + + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; } vmpressure(sc->gfp_mask, sc->target_mem_cgroup, diff --git a/mm/workingset.c b/mm/workingset.c index f7216fa7da27..aa017133744b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); + shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); pages = node_present_pages(sc->nid); @@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, } static enum lru_status shadow_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { @@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out; } - list_del_init(item); + list_lru_isolate(lru, item); spin_unlock(lru_lock); /* @@ -376,8 +377,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, - shadow_lru_isolate, NULL, &sc->nr_to_scan); + ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, + shadow_lru_isolate, NULL); local_irq_enable(); return ret; } diff --git a/mm/zbud.c b/mm/zbud.c index 4e387bea702e..2ee4e4520493 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = { .evict = zbud_zpool_evict }; -static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +static void *zbud_zpool_create(char *name, gfp_t gfp, + struct zpool_ops *zpool_ops) { return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); } diff --git a/mm/zpool.c b/mm/zpool.c index 739cdf0d183a..bacdab6e47de 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver) /** * zpool_create_pool() - Create a new zpool * @type The type of the zpool to create (e.g. zbud, zsmalloc) + * @name The name of the zpool (e.g. zram0, zswap) * @gfp The GFP flags to use when allocating the pool. * @ops The optional ops callback. * @@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver) * * Returns: New zpool on success, NULL on failure. */ -struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) +struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, + struct zpool_ops *ops) { struct zpool_driver *driver; struct zpool *zpool; @@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) zpool->type = driver->type; zpool->driver = driver; - zpool->pool = driver->create(gfp, ops); + zpool->pool = driver->create(name, gfp, ops); zpool->ops = ops; if (!zpool->pool) { diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b72403927aa4..0dec1fa5f656 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -91,6 +91,7 @@ #include <linux/hardirq.h> #include <linux/spinlock.h> #include <linux/types.h> +#include <linux/debugfs.h> #include <linux/zsmalloc.h> #include <linux/zpool.h> @@ -168,6 +169,22 @@ enum fullness_group { ZS_FULL }; +enum zs_stat_type { + OBJ_ALLOCATED, + OBJ_USED, + NR_ZS_STAT_TYPE, +}; + +#ifdef CONFIG_ZSMALLOC_STAT + +static struct dentry *zs_stat_root; + +struct zs_size_stat { + unsigned long objs[NR_ZS_STAT_TYPE]; +}; + +#endif + /* * number of size_classes */ @@ -200,6 +217,10 @@ struct size_class { /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ int pages_per_zspage; +#ifdef CONFIG_ZSMALLOC_STAT + struct zs_size_stat stats; +#endif + spinlock_t lock; struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; @@ -217,10 +238,16 @@ struct link_free { }; struct zs_pool { + char *name; + struct size_class **size_class; gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; + +#ifdef CONFIG_ZSMALLOC_STAT + struct dentry *stat_dentry; +#endif }; /* @@ -246,9 +273,9 @@ struct mapping_area { #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops) { - return zs_create_pool(gfp); + return zs_create_pool(name, gfp); } static void zs_zpool_destroy(void *pool) @@ -942,6 +969,166 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) return true; } +#ifdef CONFIG_ZSMALLOC_STAT + +static inline void zs_stat_inc(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + class->stats.objs[type] += cnt; +} + +static inline void zs_stat_dec(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ + class->stats.objs[type] -= cnt; +} + +static inline unsigned long zs_stat_get(struct size_class *class, + enum zs_stat_type type) +{ + return class->stats.objs[type]; +} + +static int __init zs_stat_init(void) +{ + if (!debugfs_initialized()) + return -ENODEV; + + zs_stat_root = debugfs_create_dir("zsmalloc", NULL); + if (!zs_stat_root) + return -ENOMEM; + + return 0; +} + +static void __exit zs_stat_exit(void) +{ + debugfs_remove_recursive(zs_stat_root); +} + +static int zs_stats_size_show(struct seq_file *s, void *v) +{ + int i; + struct zs_pool *pool = s->private; + struct size_class *class; + int objs_per_zspage; + unsigned long obj_allocated, obj_used, pages_used; + unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; + + seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size", + "obj_allocated", "obj_used", "pages_used"); + + for (i = 0; i < zs_size_classes; i++) { + class = pool->size_class[i]; + + if (class->index != i) + continue; + + spin_lock(&class->lock); + obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + obj_used = zs_stat_get(class, OBJ_USED); + spin_unlock(&class->lock); + + objs_per_zspage = get_maxobj_per_zspage(class->size, + class->pages_per_zspage); + pages_used = obj_allocated / objs_per_zspage * + class->pages_per_zspage; + + seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i, + class->size, obj_allocated, obj_used, pages_used); + + total_objs += obj_allocated; + total_used_objs += obj_used; + total_pages += pages_used; + } + + seq_puts(s, "\n"); + seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "", + total_objs, total_used_objs, total_pages); + + return 0; +} + +static int zs_stats_size_open(struct inode *inode, struct file *file) +{ + return single_open(file, zs_stats_size_show, inode->i_private); +} + +static const struct file_operations zs_stat_size_ops = { + .open = zs_stats_size_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int zs_pool_stat_create(char *name, struct zs_pool *pool) +{ + struct dentry *entry; + + if (!zs_stat_root) + return -ENODEV; + + entry = debugfs_create_dir(name, zs_stat_root); + if (!entry) { + pr_warn("debugfs dir <%s> creation failed\n", name); + return -ENOMEM; + } + pool->stat_dentry = entry; + + entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO, + pool->stat_dentry, pool, &zs_stat_size_ops); + if (!entry) { + pr_warn("%s: debugfs file entry <%s> creation failed\n", + name, "obj_in_classes"); + return -ENOMEM; + } + + return 0; +} + +static void zs_pool_stat_destroy(struct zs_pool *pool) +{ + debugfs_remove_recursive(pool->stat_dentry); +} + +#else /* CONFIG_ZSMALLOC_STAT */ + +static inline void zs_stat_inc(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ +} + +static inline void zs_stat_dec(struct size_class *class, + enum zs_stat_type type, unsigned long cnt) +{ +} + +static inline unsigned long zs_stat_get(struct size_class *class, + enum zs_stat_type type) +{ + return 0; +} + +static int __init zs_stat_init(void) +{ + return 0; +} + +static void __exit zs_stat_exit(void) +{ +} + +static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) +{ + return 0; +} + +static inline void zs_pool_stat_destroy(struct zs_pool *pool) +{ +} + +#endif + unsigned long zs_get_total_pages(struct zs_pool *pool) { return atomic_long_read(&pool->pages_allocated); @@ -1074,7 +1261,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) set_zspage_mapping(first_page, class->index, ZS_EMPTY); atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); + spin_lock(&class->lock); + zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); } obj = (unsigned long)first_page->freelist; @@ -1088,6 +1278,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) kunmap_atomic(vaddr); first_page->inuse++; + zs_stat_inc(class, OBJ_USED, 1); /* Now move the zspage to another fullness group, if required */ fix_fullness_group(pool, first_page); spin_unlock(&class->lock); @@ -1128,6 +1319,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj) first_page->inuse--; fullness = fix_fullness_group(pool, first_page); + + zs_stat_dec(class, OBJ_USED, 1); + if (fullness == ZS_EMPTY) + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); + spin_unlock(&class->lock); if (fullness == ZS_EMPTY) { @@ -1148,7 +1345,7 @@ EXPORT_SYMBOL_GPL(zs_free); * On success, a pointer to the newly created pool is returned, * otherwise NULL. */ -struct zs_pool *zs_create_pool(gfp_t flags) +struct zs_pool *zs_create_pool(char *name, gfp_t flags) { int i; struct zs_pool *pool; @@ -1158,9 +1355,16 @@ struct zs_pool *zs_create_pool(gfp_t flags) if (!pool) return NULL; + pool->name = kstrdup(name, GFP_KERNEL); + if (!pool->name) { + kfree(pool); + return NULL; + } + pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), GFP_KERNEL); if (!pool->size_class) { + kfree(pool->name); kfree(pool); return NULL; } @@ -1210,6 +1414,9 @@ struct zs_pool *zs_create_pool(gfp_t flags) pool->flags = flags; + if (zs_pool_stat_create(name, pool)) + goto err; + return pool; err: @@ -1222,6 +1429,8 @@ void zs_destroy_pool(struct zs_pool *pool) { int i; + zs_pool_stat_destroy(pool); + for (i = 0; i < zs_size_classes; i++) { int fg; struct size_class *class = pool->size_class[i]; @@ -1242,6 +1451,7 @@ void zs_destroy_pool(struct zs_pool *pool) } kfree(pool->size_class); + kfree(pool->name); kfree(pool); } EXPORT_SYMBOL_GPL(zs_destroy_pool); @@ -1250,17 +1460,30 @@ static int __init zs_init(void) { int ret = zs_register_cpu_notifier(); - if (ret) { - zs_unregister_cpu_notifier(); - return ret; - } + if (ret) + goto notifier_fail; init_zs_size_classes(); #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); #endif + + ret = zs_stat_init(); + if (ret) { + pr_err("zs stat initialization failed\n"); + goto stat_fail; + } return 0; + +stat_fail: +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif +notifier_fail: + zs_unregister_cpu_notifier(); + + return ret; } static void __exit zs_exit(void) @@ -1269,6 +1492,8 @@ static void __exit zs_exit(void) zpool_unregister_driver(&zs_zpool_driver); #endif zs_unregister_cpu_notifier(); + + zs_stat_exit(); } module_init(zs_init); diff --git a/mm/zswap.c b/mm/zswap.c index 0cfce9bc51e4..4249e82ff934 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -906,11 +906,12 @@ static int __init init_zswap(void) pr_info("loading zswap\n"); - zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, + &zswap_zpool_ops); if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { pr_info("%s zpool not available\n", zswap_zpool_type); zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; - zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, + zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, &zswap_zpool_ops); } if (!zswap_pool) { diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index c2a75c6957a1..2379c1b4efb2 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -47,6 +47,10 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg) return; percpu_counter_destroy(&cg_proto->sockets_allocated); + + if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags)) + static_key_slow_dec(&memcg_socket_limit_enabled); + } EXPORT_SYMBOL(tcp_destroy_cgroup); |