From 6ee8e25fc3e916193bce4ebb43d5439e1e2144ab Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Tue, 10 Feb 2015 14:08:32 -0800
Subject: fsnotify: fix handling of renames in audit

Commit e9fd702a58c4 ("audit: convert audit watches to use fsnotify
instead of inotify") broke handling of renames in audit.  The audit code
wants to update the inode number of the inode corresponding to a watched
name in a directory.  When something gets renamed to a watched name in a
directory, inotify previously passed the moved inode to the audit code;
the new fsnotify code instead passes the inode of the directory where
the change happened.  That confuses audit and it starts watching the
parent directory instead of the file in that directory.

This can be observed for example by doing:

  cd /tmp
  touch foo bar
  auditctl -w /tmp/foo
  touch foo
  mv bar foo
  touch foo

In the audit log we see events like:

  type=CONFIG_CHANGE msg=audit(1423563584.155:90): auid=1000 ses=2
   op="updated rules" path="/tmp/foo" key=(null) list=4 res=1
  ...
  type=PATH msg=audit(1423563584.155:91): item=2 name="bar" inode=1046884
   dev=08:02 mode=0100644 ouid=0 ogid=0 rdev=00:00 nametype=DELETE
  type=PATH msg=audit(1423563584.155:91): item=3 name="foo" inode=1046842
   dev=08:02 mode=0100644 ouid=0 ogid=0 rdev=00:00 nametype=DELETE
  type=PATH msg=audit(1423563584.155:91): item=4 name="foo" inode=1046884
   dev=08:02 mode=0100644 ouid=0 ogid=0 rdev=00:00 nametype=CREATE
  ...

and that's it - we see the event for the first touch after creating the
audit rule, we see the events for the rename, but we don't see any event
for the last touch.  Instead, we start seeing events for unrelated things
happening in /tmp.

Fix the problem by passing the moved inode as data in the FS_MOVED_FROM
and FS_MOVED_TO events instead of the directory where the change happens.
This doesn't introduce any new problems because no one besides
audit_watch.c cares about the passed value:

  fs/notify/fanotify/fanotify.c cares only about FSNOTIFY_EVENT_PATH
  events.

  fs/notify/dnotify/dnotify.c doesn't care about the passed 'data' value
  at all.

  fs/notify/inotify/inotify_fsnotify.c uses 'data' only for
  FSNOTIFY_EVENT_PATH.

  kernel/audit_tree.c doesn't care about the passed 'data' at all.

  kernel/audit_watch.c expects the moved inode as 'data'.

Fixes: e9fd702a58c49db ("audit: convert audit watches to use fsnotify instead of inotify")
Signed-off-by: Jan Kara
Cc: Paul Moore
Cc: Eric Paris
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/fsnotify.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 1c804b057fb1..7ee1774edee5 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -101,8 +101,10 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 		new_dir_mask |= FS_ISDIR;
 	}
 
-	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie);
-	fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name, fs_cookie);
+	fsnotify(old_dir, old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_name,
+		 fs_cookie);
+	fsnotify(new_dir, new_dir_mask, source, FSNOTIFY_EVENT_INODE, new_name,
+		 fs_cookie);
 
 	if (target)
 		fsnotify_link_count(target);
--
cgit v1.2.3
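The MOVED_FROM/MOVED_TO pair that fsnotify_move() generates is visible
from userspace through inotify(7), which is a convenient way to observe
behaviour like the above.  The following is an illustrative sketch, not
part of the patch: it assumes /tmp/bar already exists (as in the
reproducer above) and trims most error handling.  The two halves of a
rename share a cookie value, which is how consumers pair them up.

	#include <stdio.h>
	#include <sys/inotify.h>
	#include <unistd.h>

	int main(void)
	{
		/* Align the buffer for struct inotify_event, per inotify(7). */
		char buf[4096]
			__attribute__((aligned(__alignof__(struct inotify_event))));
		ssize_t len, i;
		int fd = inotify_init();

		inotify_add_watch(fd, "/tmp", IN_MOVED_FROM | IN_MOVED_TO);
		rename("/tmp/bar", "/tmp/foo");	/* assumes /tmp/bar exists */

		len = read(fd, buf, sizeof(buf));
		for (i = 0; i < len; ) {
			struct inotify_event *ev =
				(struct inotify_event *)(buf + i);

			/* Both events print the same cookie. */
			printf("mask=%x cookie=%u name=%s\n",
			       (unsigned)ev->mask, (unsigned)ev->cookie,
			       ev->len ? ev->name : "");
			i += sizeof(*ev) + ev->len;
		}
		return 0;
	}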
From ccaafd7fd039aebc9359a9799f8558b01f1c2adc Mon Sep 17 00:00:00 2001
From: Joonsoo Kim
Date: Tue, 10 Feb 2015 14:09:35 -0800
Subject: mm: don't use compound_head() in virt_to_head_page()

compound_head() is implemented with the assumption that there could be a
race on the tail flag while it is checked.  This assumption is only true
when we try to access an arbitrarily positioned struct page.

The situation in which virt_to_head_page() is called is a different
case: we call virt_to_head_page() only within the range of allocated
pages, so there is no race on the tail flag.  In this case we don't need
to handle the race and can reduce the overhead slightly.  This patch
implements compound_head_fast(), which is similar to compound_head()
except for the tail-flag race handling, and then makes
virt_to_head_page() use this optimized function to improve performance.

I saw a 1.8% win in a fast-path loop over kmem_cache_alloc/free
(14.063 ns -> 13.810 ns) when the target object is on a tail page.

Signed-off-by: Joonsoo Kim
Acked-by: Christoph Lameter
Cc: Pekka Enberg
Cc: David Rientjes
Cc: Joonsoo Kim
Cc: Jesper Dangaard Brouer
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm.h | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dd5ea3016fc4..2c6fd3c5424a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -446,6 +446,12 @@ static inline struct page *compound_head_by_tail(struct page *tail)
 	return tail;
 }
 
+/*
+ * Since either compound page could be dismantled asynchronously in THP
+ * or we access asynchronously arbitrary positioned struct page, there
+ * would be tail flag race. To handle this race, we should call
+ * smp_rmb() before checking tail flag. compound_head_by_tail() did it.
+ */
 static inline struct page *compound_head(struct page *page)
 {
 	if (unlikely(PageTail(page)))
@@ -453,6 +459,18 @@ static inline struct page *compound_head(struct page *page)
 	return page;
 }
 
+/*
+ * If we access compound page synchronously such as access to
+ * allocated page, there is no need to handle tail flag race, so we can
+ * check tail flag directly without any synchronization primitive.
+ */
+static inline struct page *compound_head_fast(struct page *page)
+{
+	if (unlikely(PageTail(page)))
+		return page->first_page;
+	return page;
+}
+
 /*
  * The atomic page->_mapcount, starts from -1: so that transitions
  * both from it and to it can be tracked, using atomic_inc_and_test
@@ -531,7 +549,14 @@ static inline void get_page(struct page *page)
 static inline struct page *virt_to_head_page(const void *x)
 {
 	struct page *page = virt_to_page(x);
-	return compound_head(page);
+
+	/*
+	 * We don't need to worry about synchronization of tail flag
+	 * when we call virt_to_head_page() since it is only called for
+	 * already allocated page and this page won't be freed until
+	 * this virt_to_head_page() is finished. So use _fast variant.
+	 */
+	return compound_head_fast(page);
 }
 
 /*
--
cgit v1.2.3
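A userspace analog can make the barrier reasoning concrete.  This is an
illustrative sketch, not kernel code: fakepage, head_slow() and
head_fast() are invented stand-ins for struct page, compound_head() and
compound_head_fast().  The acquire load plays the role of the smp_rmb()
in compound_head_by_tail(); the relaxed load mirrors the _fast variant
and is only safe while the caller pins the page so the flag cannot
change underneath it.

	#include <stdatomic.h>
	#include <stdio.h>

	struct fakepage {
		_Atomic int tail;		/* stand-in for PageTail()    */
		struct fakepage *first_page;	/* stand-in for ->first_page  */
	};

	/* Analog of compound_head(): tolerates concurrent splitting
	 * because the flag load is ordered before the pointer load. */
	static struct fakepage *head_slow(struct fakepage *p)
	{
		if (atomic_load_explicit(&p->tail, memory_order_acquire))
			return p->first_page;	/* acquire ~ smp_rmb() */
		return p;
	}

	/* Analog of compound_head_fast(): no ordering; valid only while
	 * the caller holds the page, so "tail" cannot change under us. */
	static struct fakepage *head_fast(struct fakepage *p)
	{
		if (atomic_load_explicit(&p->tail, memory_order_relaxed))
			return p->first_page;
		return p;
	}

	int main(void)
	{
		struct fakepage head = { 0, 0 };
		struct fakepage tail = { 1, &head };

		printf("%d %d\n", head_slow(&tail) == &head,
		       head_fast(&tail) == &head);	/* prints "1 1" */
		return 0;
	}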
From c8d78c1823f46519473949d33f0d1d33fe21ea16 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Tue, 10 Feb 2015 14:09:46 -0800
Subject: mm: replace remap_file_pages() syscall with emulation

remap_file_pages(2) was invented to be able to efficiently map parts of
a huge file into a limited 32-bit virtual address space, such as in
database workloads.

Nonlinear mappings are a pain to support and it seems there are no
legitimate use cases nowadays since 64-bit systems are widely available.
Let's drop them and get rid of all this special-cased code.

The patch replaces the syscall with an emulation which creates a new VMA
on each remap_file_pages() call, unless it can be merged with an
adjacent one.

I didn't find *any* real code that uses remap_file_pages(2) to test the
emulation impact on.  I've checked Debian code search and the source of
all packages in ALT Linux.  No real users: only libc wrappers, mentions
in strace, gdb, valgrind and that kind of thing.

There are a few basic tests for the syscall in LTP.  They work just fine
with the emulation.

To test the performance impact, I've written a small test case which
demonstrates a pretty much worst-case scenario: map a 4G shmfs file,
write the page's pgoff to the beginning of every page, remap the pages
in reverse order, then read every page.

The test creates 1 million VMAs when the emulation is in use, so I had
to set vm.max_map_count to 1100000 to avoid -ENOMEM.

  Before:   23.3 ( +- 4.31% ) seconds
  After:    43.9 ( +- 0.85% ) seconds
  Slowdown: 1.88x

I believe we can live with that.

Test case (the original headers were lost in transcription; the four
includes below are the ones the code requires):

	#define _GNU_SOURCE
	#include <assert.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <sys/mman.h>

	#define MB	(1024UL * 1024)
	#define SIZE	(4096 * MB)

	int main(int argc, char **argv)
	{
		unsigned long *p;
		long i, pass;

		for (pass = 0; pass < 10; pass++) {
			p = mmap(NULL, SIZE, PROT_READ|PROT_WRITE,
					MAP_SHARED | MAP_ANONYMOUS, -1, 0);
			if (p == MAP_FAILED) {
				perror("mmap");
				return -1;
			}

			for (i = 0; i < SIZE / 4096; i++)
				p[i * 4096 / sizeof(*p)] = i;

			for (i = 0; i < SIZE / 4096; i++) {
				if (remap_file_pages(p + i * 4096 / sizeof(*p),
						4096, 0,
						(SIZE - 4096 * (i + 1)) >> 12,
						0)) {
					perror("remap_file_pages");
					return -1;
				}
			}

			for (i = SIZE / 4096 - 1; i >= 0; i--)
				assert(p[i * 4096 / sizeof(*p)] ==
						SIZE / 4096 - i - 1);

			munmap(p, SIZE);
		}

		return 0;
	}

[akpm@linux-foundation.org: fix spello]
[sasha.levin@oracle.com: initialize populate before usage]
[sasha.levin@oracle.com: grab file ref to prevent race while mmaping]
Signed-off-by: "Kirill A. Shutemov"
Cc: Peter Zijlstra
Cc: Ingo Molnar
Cc: Dave Jones
Cc: Linus Torvalds
Cc: Armin Rigo
Signed-off-by: Sasha Levin
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/fs.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 42efe13077b6..60c4996df7f3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2481,8 +2481,12 @@ extern int sb_min_blocksize(struct super_block *, int);
 
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
-extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
-		unsigned long size, pgoff_t pgoff);
+static inline int generic_file_remap_pages(struct vm_area_struct *vma,
+		unsigned long addr, unsigned long size, pgoff_t pgoff)
+{
+	BUG();
+	return 0;
+}
 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
 extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
--
cgit v1.2.3
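The emulation's cost model - one VMA per remapped chunk - is easy to
watch from userspace.  The sketch below is illustrative only and not
from the patch: count_vmas() is a helper invented here (each line of
/proc/self/maps is one VMA), and error handling is minimal.

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	static long count_vmas(void)
	{
		FILE *f = fopen("/proc/self/maps", "r");
		long n = 0;
		int c;

		if (!f)
			return -1;
		while ((c = fgetc(f)) != EOF)
			if (c == '\n')
				n++;
		fclose(f);
		return n;
	}

	int main(void)
	{
		long pages = 16, i;
		char *p = mmap(NULL, pages * 4096, PROT_READ | PROT_WRITE,
			       MAP_SHARED | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		printf("VMAs before: %ld\n", count_vmas());
		/* Mirror the mapping: page i now shows offset pages-1-i,
		 * so the emulation cannot merge the resulting VMAs. */
		for (i = 0; i < pages; i++)
			if (remap_file_pages(p + i * 4096, 4096, 0,
					     pages - 1 - i, 0))
				perror("remap_file_pages");
		printf("VMAs after:  %ld\n", count_vmas());
		return 0;
	}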
From 8a5f14a23177061ec11daeaa3d09d0765d785c47 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Tue, 10 Feb 2015 14:09:49 -0800
Subject: mm: drop support of non-linear mapping from unmap/zap codepath

We have had remap_file_pages(2) emulation in the -mm tree for a few
release cycles and we plan to have it mainline in v3.20.  This patchset
removes the rest of the VM_NONLINEAR infrastructure.

Patches 1-8 take care of the generic code.  They are pretty
straightforward and can be applied without the other patches.

The rest of the patches remove pte_file()-related stuff from
architecture-specific code.  That usually frees up one bit in the
non-present pte.  I've tried to reuse that bit for the swap offset where
I was able to figure out how to do so.

For obvious reasons I cannot test all that arch-specific code and would
like to see acks from maintainers.

In total, remap_file_pages(2) required about 1.4K lines of
not-so-trivial kernel code.  That's too much for functionality nobody
uses.

Tested-by: Felipe Balbi

This patch (of 38):

We don't create non-linear mappings anymore.  Let's drop code which
handles them on unmap/zap.

Signed-off-by: Kirill A. Shutemov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2c6fd3c5424a..600ef5ed4698 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1146,7 +1146,6 @@ extern void user_shm_unlock(size_t, struct user_struct *);
  * Parameter block passed down to zap_pte_range in exceptional cases.
  */
 struct zap_details {
-	struct vm_area_struct *nonlinear_vma;	/* Check page->index if set */
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
--
cgit v1.2.3

From 9b4bdd2ffab9557ac43af7dff02e7dab1c8c58bd Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Tue, 10 Feb 2015 14:09:51 -0800
Subject: mm: drop support of non-linear mapping from fault codepath

We don't create non-linear mappings anymore.  Let's drop code which
handles them on page fault.

Signed-off-by: Kirill A. Shutemov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mm.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 600ef5ed4698..376e5c325dee 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -206,21 +206,19 @@ extern unsigned int kobjsize(const void *objp);
 extern pgprot_t protection_map[16];
 
 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
-#define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
-#define FAULT_FLAG_MKWRITE	0x04	/* Fault was mkwrite of existing pte */
-#define FAULT_FLAG_ALLOW_RETRY	0x08	/* Retry fault if blocking */
-#define FAULT_FLAG_RETRY_NOWAIT	0x10	/* Don't drop mmap_sem and wait when retrying */
-#define FAULT_FLAG_KILLABLE	0x20	/* The fault task is in SIGKILL killable region */
-#define FAULT_FLAG_TRIED	0x40	/* second try */
-#define FAULT_FLAG_USER		0x80	/* The fault originated in userspace */
+#define FAULT_FLAG_MKWRITE	0x02	/* Fault was mkwrite of existing pte */
+#define FAULT_FLAG_ALLOW_RETRY	0x04	/* Retry fault if blocking */
+#define FAULT_FLAG_RETRY_NOWAIT	0x08	/* Don't drop mmap_sem and wait when retrying */
+#define FAULT_FLAG_KILLABLE	0x10	/* The fault task is in SIGKILL killable region */
+#define FAULT_FLAG_TRIED	0x20	/* Second try */
+#define FAULT_FLAG_USER		0x40	/* The fault originated in userspace */
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
  * ->fault function. The vma's ->fault is responsible for returning a bitmask
  * of VM_FAULT_xxx flags that give details about how the fault was handled.
  *
- * pgoff should be used in favour of virtual_address, if possible. If pgoff
- * is used, one may implement ->remap_pages to get nonlinear mapping support.
+ * pgoff should be used in favour of virtual_address, if possible.
  */
 struct vm_fault {
 	unsigned int flags;		/* FAULT_FLAG_xxx flags */
--
cgit v1.2.3
Shutemov" Date: Tue, 10 Feb 2015 14:09:54 -0800 Subject: mm: drop vm_ops->remap_pages and generic_file_remap_pages() stub Nobody uses it anymore. [akpm@linux-foundation.org: fix filemap_xip.c] Signed-off-by: Kirill A. Shutemov Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 ------ include/linux/mm.h | 3 --- 2 files changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 60c4996df7f3..47f557c7ef7e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2481,12 +2481,6 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -static inline int generic_file_remap_pages(struct vm_area_struct *vma, - unsigned long addr, unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 376e5c325dee..2ddd9d1d6268 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -285,9 +285,6 @@ struct vm_operations_struct { struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr); #endif - /* called by sys_remap_file_pages() to populate non-linear mapping */ - int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff); }; struct mmu_gather; -- cgit v1.2.3 From 27ba0644ea9dfe6e7693abc85837b60e40583b96 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:59 -0800 Subject: rmap: drop support of non-linear mappings We don't create non-linear mappings anymore. Let's drop code which handles them in rmap. Signed-off-by: Kirill A. 
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/fs.h       | 4 +---
 include/linux/mm.h       | 6 ------
 include/linux/mm_types.h | 4 +---
 include/linux/rmap.h     | 2 --
 4 files changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 47f557c7ef7e..60acab209701 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -401,7 +401,6 @@ struct address_space {
 	spinlock_t		tree_lock;	/* and lock protecting it */
 	atomic_t		i_mmap_writable;/* count VM_SHARED mappings */
 	struct rb_root		i_mmap;		/* tree of private and shared mappings */
-	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
 	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
 	/* Protected by tree_lock together with the radix tree */
 	unsigned long		nrpages;	/* number of total pages */
@@ -493,8 +492,7 @@ static inline void i_mmap_unlock_read(struct address_space *mapping)
  */
 static inline int mapping_mapped(struct address_space *mapping)
 {
-	return !RB_EMPTY_ROOT(&mapping->i_mmap) ||
-		!list_empty(&mapping->i_mmap_nonlinear);
+	return !RB_EMPTY_ROOT(&mapping->i_mmap);
 }
 
 /*

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2ddd9d1d6268..18391eec4864 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1796,12 +1796,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
 	for (vma = vma_interval_tree_iter_first(root, start, last);	\
 	     vma; vma = vma_interval_tree_iter_next(vma, start, last))
 
-static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
-					struct list_head *list)
-{
-	list_add_tail(&vma->shared.nonlinear, list);
-}
-
 void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
 				   struct rb_root *root);
 void anon_vma_interval_tree_remove(struct anon_vma_chain *node,

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6d34aa266a8c..3b1d20fb0848 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -273,15 +273,13 @@ struct vm_area_struct {
 
 	/*
 	 * For areas with an address space and backing store,
-	 * linkage into the address_space->i_mmap interval tree, or
-	 * linkage of vma in the address_space->i_mmap_nonlinear list.
+	 * linkage into the address_space->i_mmap interval tree.
 	 */
 	union {
 		struct {
 			struct rb_node rb;
 			unsigned long rb_subtree_last;
 		} linear;
-		struct list_head nonlinear;
 	} shared;
 
 	/*

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d9d7e7e56352..b38f559130d5 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -246,7 +246,6 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
  * arg: passed to rmap_one() and invalid_vma()
  * rmap_one: executed on each vma where page is mapped
  * done: for checking traversing termination condition
- * file_nonlinear: for handling file nonlinear mapping
  * anon_lock: for getting anon_lock by optimized way rather than default
  * invalid_vma: for skipping uninterested vma
  */
@@ -255,7 +254,6 @@ struct rmap_walk_control {
 	int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
 					unsigned long addr, void *arg);
 	int (*done)(struct page *page);
-	int (*file_nonlinear)(struct page *, struct address_space *, void *arg);
 	struct anon_vma *(*anon_lock)(struct page *page);
 	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
 };
--
cgit v1.2.3

From ac51b934f3912582d3c897c6c4d09b32ea57b2c7 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Shutemov" Date: Tue, 10 Feb 2015 14:10:02 -0800 Subject: mm: replace vma->sharead.linear with vma->shared After removing vma->shared.nonlinear we have only one member of vma->shared union, which doesn't make much sense. This patch drops the union and move struct vma->shared.linear to vma->shared. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3b1d20fb0848..07c8bd3f7b48 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -275,11 +275,9 @@ struct vm_area_struct { * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. */ - union { - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } linear; + struct { + struct rb_node rb; + unsigned long rb_subtree_last; } shared; /* -- cgit v1.2.3 From 0661a33611fca12570cba48d9344ce68834ee86c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:04 -0800 Subject: mm: remove rest usage of VM_NONLINEAR and pte_file() One bit in ->vm_flags is unused now! Signed-off-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - include/linux/swapops.h | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 18391eec4864..a0da685bdb82 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -138,7 +138,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_ARCH_2 0x02000000 #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6adfb7bfbf44..50cbc876be56 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte); + return !pte_none(pte) && !pte_present_nonuma(pte); } #endif @@ -66,7 +66,6 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte) { swp_entry_t arch_entry; - BUG_ON(pte_file(pte)); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); arch_entry = __pte_to_swp_entry(pte); @@ -82,7 +81,6 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry) swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); - BUG_ON(pte_file(__swp_entry_to_pte(arch_entry))); return __swp_entry_to_pte(arch_entry); } -- cgit v1.2.3 From 753162cd849c45580fb5aaa7f3597c81e74e391c Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 10 Feb 2015 14:11:36 -0800 Subject: mm: hugetlb: fix type of hugetlb_treat_as_movable variable hugetlb_treat_as_movable declared as unsigned long, but proc_dointvec() used for parsing it: static struct ctl_table vm_table[] = { ... 
  {
	.procname	= "hugepages_treat_as_movable",
	.data		= &hugepages_treat_as_movable,
	.maxlen		= sizeof(int),
	.mode		= 0644,
	.proc_handler	= proc_dointvec,
  },

This seems harmless, but it's better to use the int type here.

Signed-off-by: Andrey Ryabinin
Cc: Dmitry Vyukov
Cc: Manfred Spraul
Acked-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/hugetlb.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 431b7fc605c9..7d7856359920 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -86,7 +86,7 @@ void free_huge_page(struct page *page);
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
 #endif
 
-extern unsigned long hugepages_treat_as_movable;
+extern int hugepages_treat_as_movable;
 extern int sysctl_hugetlb_shm_group;
 extern struct list_head huge_boot_pages;
--
cgit v1.2.3
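The hazard is easy to demonstrate outside the kernel.  The sketch below
is illustrative, not from the patch: copying sizeof(int) bytes into
unsigned long storage - effectively what proc_dointvec() does when
.maxlen says sizeof(int) but .data points at a long - leaves the upper
bytes stale on a little-endian machine and puts the bytes in the wrong
place entirely on a big-endian one.

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* Pretend the variable held a stale all-ones pattern. */
		unsigned long movable = ~0UL;
		int written = 1;	/* what the sysctl writes */

		/* proc_dointvec() stores an int into the buffer... */
		memcpy(&movable, &written, sizeof(written));

		/* ...but readers treat the storage as a long.  On 64-bit
		 * little-endian this prints 0xffffffff00000001. */
		printf("stored 1, read back %#lx\n", movable);
		return 0;
	}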
From dbf22eb6d8675fc173154d9f1bd1bd0fda53a001 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Tue, 10 Feb 2015 14:11:41 -0800
Subject: memcg: zap __memcg_{charge,uncharge}_slab

They are simple wrappers around memcg_{charge,uncharge}_kmem, so let's
zap them and call these functions directly.

Signed-off-by: Vladimir Davydov
Cc: Johannes Weiner
Cc: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7c95af8d552c..18ccb2988979 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -403,8 +403,9 @@ void memcg_update_array_size(int num_groups);
 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep);
 void __memcg_kmem_put_cache(struct kmem_cache *cachep);
 
-int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order);
-void __memcg_uncharge_slab(struct kmem_cache *cachep, int order);
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
+		      unsigned long nr_pages);
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages);
 
 int __memcg_cleanup_cache_params(struct kmem_cache *s);
--
cgit v1.2.3

From 3e0350a36414a73c5c2d1e354f8c0ab4ace1296d Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Tue, 10 Feb 2015 14:11:44 -0800
Subject: memcg: zap memcg_name argument of memcg_create_kmem_cache

Instead of passing the name of the memory cgroup which the cache is
created for in the memcg_name argument, let's obtain it immediately in
memcg_create_kmem_cache.

Signed-off-by: Vladimir Davydov
Cc: Johannes Weiner
Cc: Michal Hocko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/slab.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 9a139b637069..eca9ed303a1b 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -117,8 +117,7 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			void (*)(void *));
 #ifdef CONFIG_MEMCG_KMEM
 struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *,
-					   struct kmem_cache *,
-					   const char *);
+					   struct kmem_cache *);
 #endif
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
--
cgit v1.2.3

From d5b3cf7139b8770af4ed8bb36a1ab9d290ac39e9 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Tue, 10 Feb 2015 14:11:47 -0800
Subject: memcg: zap memcg_slab_caches and memcg_slab_mutex

mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to
the given cgroup.  Currently, it is only used on css free in order to
destroy all caches corresponding to the memory cgroup being freed.  The
list is protected by memcg_slab_mutex.  The mutex is also used to
protect kmem_cache->memcg_params->memcg_caches arrays and to synchronize
kmem_cache_destroy vs memcg_unregister_all_caches.

However, we can perfectly get on without these two.  To destroy all
caches corresponding to a memory cgroup, we can walk over the global
list of kmem caches, slab_caches, and we can do all the synchronization
stuff using the slab_mutex instead of the memcg_slab_mutex.  This patch
therefore gets rid of the memcg_slab_caches and memcg_slab_mutex.

Apart from this nice cleanup, it also:

 - assures that rcu_barrier() is called at most once when a root cache
   is destroyed or a memory cgroup is freed, no matter how many caches
   have the SLAB_DESTROY_BY_RCU flag set;

 - fixes the race between kmem_cache_destroy and kmem_cache_create that
   exists because memcg_cleanup_cache_params, which is called from
   kmem_cache_destroy after checking that kmem_cache->refcount=0,
   releases the slab_mutex, which gives kmem_cache_create a chance to
   make an alias to a cache doomed to be destroyed.

Signed-off-by: Vladimir Davydov
Cc: Johannes Weiner
Cc: Michal Hocko
Acked-by: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memcontrol.h | 2 --
 include/linux/slab.h       | 6 ++----
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 18ccb2988979..fb212e1d700d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -407,8 +407,6 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
 		      unsigned long nr_pages);
 void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages);
 
-int __memcg_cleanup_cache_params(struct kmem_cache *s);
-
 /**
  * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
  * @gfp: the gfp allocation flags.
diff --git a/include/linux/slab.h b/include/linux/slab.h
index eca9ed303a1b..2e3b448cfa2d 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -116,8 +116,8 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			unsigned long,
 			void (*)(void *));
 #ifdef CONFIG_MEMCG_KMEM
-struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *,
-					   struct kmem_cache *);
+void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
+void memcg_destroy_kmem_caches(struct mem_cgroup *);
 #endif
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
@@ -490,7 +490,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 * Child caches will hold extra metadata needed for its operation. Fields are:
 *
 * @memcg: pointer to the memcg this cache belongs to
- * @list: list_head for the list of all caches in this memcg
 * @root_cache: pointer to the global, root cache, this cache was derived from
 */
struct memcg_cache_params {
@@ -502,7 +501,6 @@ struct memcg_cache_params {
 	};
 	struct {
 		struct mem_cgroup *memcg;
-		struct list_head list;
 		struct kmem_cache *root_cache;
 	};
 };
--
cgit v1.2.3