author    Linus Torvalds <torvalds@linux-foundation.org>    2015-09-05 14:27:38 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>    2015-09-05 14:27:38 -0700
commit    6c0f568e84a3cfc775682311d65205462c3f3bc1 (patch)
tree      5105a137a9ea2459d55e895d3c096bbd31274724 /include
parent    c82199061009d1561e31e17fca5e47a87cb7ff4c (diff)
parent    559ec2f8fd50981821621f52db5e1a8ffcf8d792 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge patch-bomb from Andrew Morton:

 - a few misc things

 - Andy's "ambient capabilities"

 - fs/notify updates

 - the ocfs2 queue

 - kernel/watchdog.c updates and feature work

 - some of MM.  Includes Andrea's userfaultfd feature.

[ Hadn't noticed that userfaultfd was 'default y' when applying the
  patches, so that got fixed in this merge instead.  We do _not_ mark
  new features that nobody uses yet 'default y' - Linus ]

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits)
  mm/hugetlb.c: make vma_has_reserves() return bool
  mm/madvise.c: make madvise_behaviour_valid() return bool
  mm/memory.c: make tlb_next_batch() return bool
  mm/dmapool.c: change is_page_busy() return from int to bool
  mm: remove struct node_active_region
  mremap: simplify the "overlap" check in mremap_to()
  mremap: don't do uneccesary checks if new_len == old_len
  mremap: don't do mm_populate(new_addr) on failure
  mm: move ->mremap() from file_operations to vm_operations_struct
  mremap: don't leak new_vma if f_op->mremap() fails
  mm/hugetlb.c: make vma_shareable() return bool
  mm: make GUP handle pfn mapping unless FOLL_GET is requested
  mm: fix status code which move_pages() returns for zero page
  mm: memcontrol: bring back the VM_BUG_ON() in mem_cgroup_swapout()
  genalloc: add support of multiple gen_pools per device
  genalloc: add name arg to gen_pool_get() and devm_gen_pool_create()
  mm/memblock: WARN_ON when nid differs from overlap region
  Documentation/features/vm: add feature description and arch support status for batched TLB flush after unmap
  mm: defer flush of writable TLB entries
  mm: send one IPI per CPU to TLB flush all entries after unmapping pages
  ...
Diffstat (limited to 'include')
-rw-r--r--    include/linux/cred.h                  8
-rw-r--r--    include/linux/fs.h                    1
-rw-r--r--    include/linux/fsnotify_backend.h     55
-rw-r--r--    include/linux/genalloc.h              6
-rw-r--r--    include/linux/kthread.h               2
-rw-r--r--    include/linux/mm.h                    5
-rw-r--r--    include/linux/mm_types.h             12
-rw-r--r--    include/linux/mmzone.h                8
-rw-r--r--    include/linux/nmi.h                  15
-rw-r--r--    include/linux/rmap.h                  3
-rw-r--r--    include/linux/sched.h                23
-rw-r--r--    include/linux/seq_file.h             35
-rw-r--r--    include/linux/slab.h                 10
-rw-r--r--    include/linux/smpboot.h              11
-rw-r--r--    include/linux/syscalls.h              1
-rw-r--r--    include/linux/userfaultfd_k.h        85
-rw-r--r--    include/linux/wait.h                  5
-rw-r--r--    include/linux/watchdog.h              8
-rw-r--r--    include/trace/events/tlb.h            3
-rw-r--r--    include/uapi/linux/Kbuild             1
-rw-r--r--    include/uapi/linux/prctl.h            7
-rw-r--r--    include/uapi/linux/securebits.h      11
-rw-r--r--    include/uapi/linux/userfaultfd.h    169
23 files changed, 433 insertions(+), 51 deletions(-)
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 8b6c083e68a7..8d70e1361ecd 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -137,6 +137,7 @@ struct cred {
kernel_cap_t cap_permitted; /* caps we're permitted */
kernel_cap_t cap_effective; /* caps we can actually use */
kernel_cap_t cap_bset; /* capability bounding set */
+ kernel_cap_t cap_ambient; /* Ambient capability set */
#ifdef CONFIG_KEYS
unsigned char jit_keyring; /* default keyring to attach requested
* keys to */
@@ -212,6 +213,13 @@ static inline void validate_process_creds(void)
}
#endif
+static inline bool cap_ambient_invariant_ok(const struct cred *cred)
+{
+ return cap_issubset(cred->cap_ambient,
+ cap_intersect(cred->cap_permitted,
+ cred->cap_inheritable));
+}
+
/**
* get_new_cred - Get a reference on a new set of credentials
* @cred: The new credentials to reference
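The new set comes with an invariant: cap_ambient must stay a subset of the
intersection of cap_permitted and cap_inheritable. A hedged, userspace-style
restatement of that predicate using plain 64-bit masks (kernel_cap_t is a
wider type; this just makes the subset relation concrete):

    #include <stdbool.h>
    #include <stdint.h>

    struct caps {            /* illustrative stand-in for struct cred */
            uint64_t permitted;
            uint64_t inheritable;
            uint64_t ambient;
    };

    /* Same predicate as cap_ambient_invariant_ok() above:
     * ambient must be a subset of (permitted & inheritable). */
    static bool ambient_invariant_ok(const struct caps *c)
    {
            return (c->ambient & ~(c->permitted & c->inheritable)) == 0;
    }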
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fbd780c33c5f..864203c10dbc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1612,7 +1612,6 @@ struct file_operations {
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
- int (*mremap)(struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 65a517dd32f7..e0727d77feaf 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -195,40 +195,49 @@ struct fsnotify_group {
#define FSNOTIFY_EVENT_INODE 2
/*
- * a mark is simply an object attached to an in core inode which allows an
+ * A mark is simply an object attached to an in core inode which allows an
* fsnotify listener to indicate they are either no longer interested in events
* of a type matching mask or only interested in those events.
*
- * these are flushed when an inode is evicted from core and may be flushed
- * when the inode is modified (as seen by fsnotify_access). Some fsnotify users
- * (such as dnotify) will flush these when the open fd is closed and not at
- * inode eviction or modification.
+ * These are flushed when an inode is evicted from core and may be flushed
+ * when the inode is modified (as seen by fsnotify_access). Some fsnotify
+ * users (such as dnotify) will flush these when the open fd is closed and not
+ * at inode eviction or modification.
+ *
+ * Text in brackets is showing the lock(s) protecting modifications of a
+ * particular entry. obj_lock means either inode->i_lock or
+ * mnt->mnt_root->d_lock depending on the mark type.
*/
struct fsnotify_mark {
- __u32 mask; /* mask this mark is for */
- /* we hold ref for each i_list and g_list. also one ref for each 'thing'
+ /* Mask this mark is for [mark->lock, group->mark_mutex] */
+ __u32 mask;
+ /* We hold one for presence in g_list. Also one ref for each 'thing'
* in kernel that found and may be using this mark. */
- atomic_t refcnt; /* active things looking at this mark */
- struct fsnotify_group *group; /* group this mark is for */
- struct list_head g_list; /* list of marks by group->i_fsnotify_marks
- * Also reused for queueing mark into
- * destroy_list when it's waiting for
- * the end of SRCU period before it can
- * be freed */
- spinlock_t lock; /* protect group and inode */
- struct hlist_node obj_list; /* list of marks for inode / vfsmount */
- struct list_head free_list; /* tmp list used when freeing this mark */
- union {
+ atomic_t refcnt;
+ /* Group this mark is for. Set on mark creation, stable until last ref
+ * is dropped */
+ struct fsnotify_group *group;
+ /* List of marks by group->i_fsnotify_marks. Also reused for queueing
+ * mark into destroy_list when it's waiting for the end of SRCU period
+ * before it can be freed. [group->mark_mutex] */
+ struct list_head g_list;
+ /* Protects inode / mnt pointers, flags, masks */
+ spinlock_t lock;
+ /* List of marks for inode / vfsmount [obj_lock] */
+ struct hlist_node obj_list;
+ union { /* Object pointer [mark->lock, group->mark_mutex] */
struct inode *inode; /* inode this mark is associated with */
struct vfsmount *mnt; /* vfsmount this mark is associated with */
};
- __u32 ignored_mask; /* events types to ignore */
+ /* Events types to ignore [mark->lock, group->mark_mutex] */
+ __u32 ignored_mask;
#define FSNOTIFY_MARK_FLAG_INODE 0x01
#define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02
#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04
#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08
#define FSNOTIFY_MARK_FLAG_ALIVE 0x10
- unsigned int flags; /* vfsmount or inode mark? */
+#define FSNOTIFY_MARK_FLAG_ATTACHED 0x20
+ unsigned int flags; /* flags [mark->lock] */
void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */
};
@@ -345,8 +354,10 @@ extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct fsnotify_
/* given a group and a mark, flag mark to be freed when all references are dropped */
extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group);
-extern void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark,
- struct fsnotify_group *group);
+/* detach mark from inode / mount list, group list, drop inode reference */
+extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
+/* free mark */
+extern void fsnotify_free_mark(struct fsnotify_mark *mark);
/* run all the marks in a group, and clear all of the vfsmount marks */
extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group);
/* run all the marks in a group, and clear all of the inode marks */
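The one-shot fsnotify_destroy_mark_locked() is gone; teardown is split so the
SRCU-protected free can happen outside group->mark_mutex. A hedged sketch of
the replacement sequence for a caller that used to invoke it with the mutex
held:

    mutex_lock(&group->mark_mutex);
    fsnotify_detach_mark(mark);     /* unhook from inode/mount and g_list */
    mutex_unlock(&group->mark_mutex);
    fsnotify_free_mark(mark);       /* final cleanup once SRCU readers drain */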
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 5383bb1394a1..7ff168d06967 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -59,6 +59,8 @@ struct gen_pool {
genpool_algo_t algo; /* allocation function */
void *data;
+
+ const char *name;
};
/*
@@ -118,8 +120,8 @@ extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size,
unsigned long start, unsigned int nr, void *data);
extern struct gen_pool *devm_gen_pool_create(struct device *dev,
- int min_alloc_order, int nid);
-extern struct gen_pool *gen_pool_get(struct device *dev);
+ int min_alloc_order, int nid, const char *name);
+extern struct gen_pool *gen_pool_get(struct device *dev, const char *name);
bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start,
size_t size);
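With the new name argument, one device can own several pools and look each up
by name. A hedged driver-style sketch (the pool names, granule order, and
allocation size are made up for illustration):

    struct gen_pool *rx, *tx, *pool;
    unsigned long addr;

    /* Two managed pools on the same device, distinguished by name. */
    rx = devm_gen_pool_create(dev, ilog2(64), NUMA_NO_NODE, "rx");
    tx = devm_gen_pool_create(dev, ilog2(64), NUMA_NO_NODE, "tx");

    /* Later, anywhere the device is known, fetch a specific pool. */
    pool = gen_pool_get(dev, "rx");
    if (pool)
            addr = gen_pool_alloc(pool, 256);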
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 869b21dcf503..e691b6a23f72 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -11,7 +11,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
const char namefmt[], ...);
#define kthread_create(threadfn, data, namefmt, arg...) \
- kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)
+ kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bf6f117fcf4d..8b257c43855b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -124,8 +124,10 @@ extern unsigned int kobjsize(const void *objp);
#define VM_MAYSHARE 0x00000080
#define VM_GROWSDOWN 0x00000100 /* general info on the segment */
+#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */
#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
+#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */
#define VM_LOCKED 0x00002000
#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
@@ -245,6 +247,7 @@ struct vm_fault {
struct vm_operations_struct {
void (*open)(struct vm_area_struct * area);
void (*close)(struct vm_area_struct * area);
+ int (*mremap)(struct vm_area_struct * area);
int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -1833,7 +1836,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
- struct mempolicy *);
+ struct mempolicy *, struct vm_userfaultfd_ctx);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int split_vma(struct mm_struct *,
struct vm_area_struct *, unsigned long addr, int new_below);
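Since ->mremap() moved off file_operations, drivers hook VMA moves through
vm_operations_struct instead, and the callback receives the VMA rather than
the file. A hedged sketch with hypothetical names:

    /* Hypothetical driver: update bookkeeping when its mapping moves. */
    static int mydrv_mremap(struct vm_area_struct *vma)
    {
            struct mydrv_buf *buf = vma->vm_private_data;

            buf->uaddr = vma->vm_start;     /* record the new user address */
            return 0;                       /* non-zero fails the mremap() */
    }

    static const struct vm_operations_struct mydrv_vm_ops = {
            .mremap = mydrv_mremap,
            /* .open/.close/.fault as before */
    };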
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 15549578d559..c8d0a73d64c4 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -256,6 +256,16 @@ struct vm_region {
* this region */
};
+#ifdef CONFIG_USERFAULTFD
+#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
+struct vm_userfaultfd_ctx {
+ struct userfaultfd_ctx *ctx;
+};
+#else /* CONFIG_USERFAULTFD */
+#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
+struct vm_userfaultfd_ctx {};
+#endif /* CONFIG_USERFAULTFD */
+
/*
* This struct defines a memory VMM memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
@@ -322,6 +332,7 @@ struct vm_area_struct {
#ifdef CONFIG_NUMA
struct mempolicy *vm_policy; /* NUMA policy for the VMA */
#endif
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
};
struct core_thread {
@@ -543,6 +554,7 @@ enum tlb_flush_reason {
TLB_REMOTE_SHOOTDOWN,
TLB_LOCAL_SHOOTDOWN,
TLB_LOCAL_MM_SHOOTDOWN,
+ TLB_REMOTE_SEND_IPI,
NR_TLB_FLUSH_REASONS,
};
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 754c25966a0a..ac00e2050943 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -690,14 +690,6 @@ struct zonelist {
#endif
};
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-struct node_active_region {
- unsigned long start_pfn;
- unsigned long end_pfn;
- int nid;
-};
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
#ifndef CONFIG_DISCONTIGMEM
/* The array of struct pages - for discontigmem use pgdat->lmem_map */
extern struct page *mem_map;
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index f94da0e65dea..a91adf6e02f2 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -27,9 +27,7 @@ static inline void touch_nmi_watchdog(void)
#if defined(CONFIG_HARDLOCKUP_DETECTOR)
extern void hardlockup_detector_disable(void);
#else
-static inline void hardlockup_detector_disable(void)
-{
-}
+static inline void hardlockup_detector_disable(void) {}
#endif
/*
@@ -80,6 +78,17 @@ extern int proc_watchdog_thresh(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
extern int proc_watchdog_cpumask(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
+extern int lockup_detector_suspend(void);
+extern void lockup_detector_resume(void);
+#else
+static inline int lockup_detector_suspend(void)
+{
+ return 0;
+}
+
+static inline void lockup_detector_resume(void)
+{
+}
#endif
#ifdef CONFIG_HAVE_ACPI_APEI_NMI
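lockup_detector_suspend()/lockup_detector_resume() let a caller temporarily
quiesce the watchdog around work known to monopolize a CPU; with the detector
configured out they compile to the no-op stubs above, so callers need no
#ifdefs. A hedged usage sketch:

    if (!lockup_detector_suspend()) {
            /* ... long-running, watchdog-hostile work ... */
            lockup_detector_resume();
    }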
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c89c53a113a8..29446aeef36e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -89,6 +89,9 @@ enum ttu_flags {
TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */
TTU_IGNORE_ACCESS = (1 << 9), /* don't age */
TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+ TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible
+ * and caller guarantees they will
+ * do a final flush if necessary */
};
#ifdef CONFIG_MMU
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 119823decc46..a4ab9daa387c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1344,6 +1344,25 @@ enum perf_event_task_context {
perf_nr_task_contexts,
};
+/* Track pages that require TLB flushes */
+struct tlbflush_unmap_batch {
+ /*
+ * Each bit set is a CPU that potentially has a TLB entry for one of
+ * the PFNs being flushed. See set_tlb_ubc_flush_pending().
+ */
+ struct cpumask cpumask;
+
+ /* True if any bit in cpumask is set */
+ bool flush_required;
+
+ /*
+ * If true then the PTE was dirty when unmapped. The entry must be
+ * flushed before IO is initiated or a stale TLB entry potentially
+ * allows an update without redirtying the page.
+ */
+ bool writable;
+};
+
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
@@ -1700,6 +1719,10 @@ struct task_struct {
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ struct tlbflush_unmap_batch tlb_ubc;
+#endif
+
struct rcu_head rcu;
/*
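The per-task tlb_ubc structure is how the unmap path defers flushes: instead
of an immediate IPI per page, each target CPU is recorded in the cpumask and
one flush is issued at the end. A simplified, illustrative-only sketch of the
accumulate-then-flush pattern (the real code lives in mm/rmap.c, see
set_tlb_ubc_flush_pending(); the flush hook is a placeholder):

    static void ubc_note_cpu(struct tlbflush_unmap_batch *ubc,
                             int cpu, bool pte_was_dirty)
    {
            cpumask_set_cpu(cpu, &ubc->cpumask);
            ubc->flush_required = true;
            if (pte_was_dirty)
                    ubc->writable = true;   /* flush before I/O can start */
    }

    static void ubc_flush(struct tlbflush_unmap_batch *ubc)
    {
            if (!ubc->flush_required)
                    return;
            /* one IPI per CPU in ubc->cpumask; arch flush hook goes here */
            cpumask_clear(&ubc->cpumask);
            ubc->flush_required = false;
            ubc->writable = false;
    }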
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 912a7c482649..d4c7271382cb 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -149,6 +149,41 @@ static inline struct user_namespace *seq_user_ns(struct seq_file *seq)
#endif
}
+/**
+ * seq_show_options - display mount options with appropriate escapes.
+ * @m: the seq_file handle
+ * @name: the mount option name
+ * @value: the mount option's value, which can be NULL
+ */
+static inline void seq_show_option(struct seq_file *m, const char *name,
+ const char *value)
+{
+ seq_putc(m, ',');
+ seq_escape(m, name, ",= \t\n\\");
+ if (value) {
+ seq_putc(m, '=');
+ seq_escape(m, value, ", \t\n\\");
+ }
+}
+
+/**
+ * seq_show_option_n - display mount options with appropriate escapes
+ * where @value must be a specific length.
+ * @m: the seq_file handle
+ * @name: the mount option name
+ * @value: the mount option's value, which cannot be NULL
+ * @length: the length of @value to display
+ *
+ * This is a macro since this uses "length" to define the size of the
+ * stack buffer.
+ */
+#define seq_show_option_n(m, name, value, length) { \
+ char val_buf[length + 1]; \
+ strncpy(val_buf, value, length); \
+ val_buf[length] = '\0'; \
+ seq_show_option(m, name, val_buf); \
+}
+
#define SEQ_START_TOKEN ((void *)1)
/*
* Helpers for iteration over list_head-s in seq_files
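A filesystem's ->show_options() can use these helpers to emit properly
escaped mount options. A hedged sketch (myfs_info and its fields are
hypothetical; fsi->tag is assumed to be a fixed-size char array that may lack
a terminating NUL, which is what seq_show_option_n() is for):

    static int myfs_show_options(struct seq_file *m, struct dentry *root)
    {
            struct myfs_info *fsi = root->d_sb->s_fs_info;

            if (fsi->owner)
                    seq_show_option(m, "owner", fsi->owner);
            /* fixed-length value from a buffer without a guaranteed NUL */
            seq_show_option_n(m, "tag", fsi->tag, sizeof(fsi->tag));
            return 0;
    }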
diff --git a/include/linux/slab.h b/include/linux/slab.h
index a99f0e5243e1..7e37d448ed91 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -290,6 +290,16 @@ void *__kmalloc(size_t size, gfp_t flags);
void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
void kmem_cache_free(struct kmem_cache *, void *);
+/*
+ * Bulk allocation and freeing operations. These are accelerated in an
+ * allocator specific way to avoid taking locks repeatedly or building
+ * metadata structures unnecessarily.
+ *
+ * Note that interrupts must be enabled when calling these functions.
+ */
+void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
+bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+
#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node);
void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
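A hedged sketch of the bulk API: allocate a batch of objects in one call and
free them the same way (the "demo" cache and sizes are hypothetical; as the
comment above notes, interrupts must be enabled here):

    void *objs[16];
    struct kmem_cache *cache = kmem_cache_create("demo", 256, 0, 0, NULL);

    /* Returns true only when all requested objects were allocated. */
    if (kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(objs), objs)) {
            /* ... use objs[0..15] ... */
            kmem_cache_free_bulk(cache, ARRAY_SIZE(objs), objs);
    }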
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index da3c593f9845..e6109a6cd8f6 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -48,7 +48,16 @@ struct smp_hotplug_thread {
const char *thread_comm;
};
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *cpumask);
+
+static inline int
+smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+ return smpboot_register_percpu_thread_cpumask(plug_thread,
+ cpu_possible_mask);
+}
+
void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
const struct cpumask *);
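smpboot_register_percpu_thread() is now just a wrapper passing
cpu_possible_mask; callers that want threads on a subset of CPUs use the
_cpumask variant directly. A hedged sketch (the "demo" names are
hypothetical, and the callbacks are assumed to be defined elsewhere):

    static DEFINE_PER_CPU(struct task_struct *, demo_task);

    static struct smp_hotplug_thread demo_threads = {
            .store             = &demo_task,
            .thread_should_run = demo_should_run,   /* defined elsewhere */
            .thread_fn         = demo_fn,           /* defined elsewhere */
            .thread_comm       = "demo/%u",
    };

    /* Spawn only on the CPUs in demo_mask instead of every possible CPU. */
    err = smpboot_register_percpu_thread_cpumask(&demo_threads, demo_mask);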
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b45c45b8c829..08001317aee7 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -810,6 +810,7 @@ asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
asmlinkage long sys_eventfd(unsigned int count);
asmlinkage long sys_eventfd2(unsigned int count, int flags);
asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
+asmlinkage long sys_userfaultfd(int flags);
asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
new file mode 100644
index 000000000000..587480ad41b7
--- /dev/null
+++ b/include/linux/userfaultfd_k.h
@@ -0,0 +1,85 @@
+/*
+ * include/linux/userfaultfd_k.h
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_K_H
+#define _LINUX_USERFAULTFD_K_H
+
+#ifdef CONFIG_USERFAULTFD
+
+#include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */
+
+#include <linux/fcntl.h>
+
+/*
+ * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
+ * new flags, since they might collide with O_* ones. We want
+ * to re-use O_* flags that couldn't possibly have a meaning
+ * from userfaultfd, in order to leave a free define-space for
+ * shared O_* flags.
+ */
+#define UFFD_CLOEXEC O_CLOEXEC
+#define UFFD_NONBLOCK O_NONBLOCK
+
+#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
+#define UFFD_FLAGS_SET (UFFD_SHARED_FCNTL_FLAGS)
+
+extern int handle_userfault(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags, unsigned long reason);
+
+extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len);
+extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len);
+
+/* mm helpers */
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+ struct vm_userfaultfd_ctx vm_ctx)
+{
+ return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx;
+}
+
+static inline bool userfaultfd_missing(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & VM_UFFD_MISSING;
+}
+
+static inline bool userfaultfd_armed(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP);
+}
+
+#else /* CONFIG_USERFAULTFD */
+
+/* mm helpers */
+static inline int handle_userfault(struct vm_area_struct *vma,
+ unsigned long address,
+ unsigned int flags,
+ unsigned long reason)
+{
+ return VM_FAULT_SIGBUS;
+}
+
+static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
+ struct vm_userfaultfd_ctx vm_ctx)
+{
+ return true;
+}
+
+static inline bool userfaultfd_missing(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+static inline bool userfaultfd_armed(struct vm_area_struct *vma)
+{
+ return false;
+}
+
+#endif /* CONFIG_USERFAULTFD */
+
+#endif /* _LINUX_USERFAULTFD_K_H */
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 1e1bf9f963a9..d3d077228d4c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -147,7 +147,8 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old)
typedef int wait_bit_action_f(struct wait_bit_key *);
void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+ void *key);
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr);
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
@@ -179,7 +180,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
#define wake_up_poll(x, m) \
__wake_up(x, TASK_NORMAL, 1, (void *) (m))
#define wake_up_locked_poll(x, m) \
- __wake_up_locked_key((x), TASK_NORMAL, (void *) (m))
+ __wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m))
#define wake_up_interruptible_poll(x, m) \
__wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m))
#define wake_up_interruptible_sync_poll(x, m) \
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index f47feada5b42..d74a0e907b9e 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -140,12 +140,4 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd,
extern int watchdog_register_device(struct watchdog_device *);
extern void watchdog_unregister_device(struct watchdog_device *);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-void watchdog_nmi_disable_all(void);
-void watchdog_nmi_enable_all(void);
-#else
-static inline void watchdog_nmi_disable_all(void) {}
-static inline void watchdog_nmi_enable_all(void) {}
-#endif
-
#endif /* ifndef _LINUX_WATCHDOG_H */
diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h
index 4250f364a6ca..bc8815f45f3b 100644
--- a/include/trace/events/tlb.h
+++ b/include/trace/events/tlb.h
@@ -11,7 +11,8 @@
EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \
EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \
EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \
- EMe( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" )
+ EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \
+ EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" )
/*
* First define the enums in TLB_FLUSH_REASON to be exported to userspace
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index aafb9937b162..70ff1d9abf0d 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -456,3 +456,4 @@ header-y += xfrm.h
header-y += xilinx-v4l2-controls.h
header-y += zorro.h
header-y += zorro_ids.h
+header-y += userfaultfd.h
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 31891d9535e2..a8d0759a9e40 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -190,4 +190,11 @@ struct prctl_mm_map {
# define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */
# define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */
+/* Control the ambient capability set */
+#define PR_CAP_AMBIENT 47
+# define PR_CAP_AMBIENT_IS_SET 1
+# define PR_CAP_AMBIENT_RAISE 2
+# define PR_CAP_AMBIENT_LOWER 3
+# define PR_CAP_AMBIENT_CLEAR_ALL 4
+
#endif /* _LINUX_PRCTL_H */
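From userspace the new ambient set is driven entirely through prctl(2);
raising a capability requires it to already be in both the permitted and
inheritable sets, matching cap_ambient_invariant_ok() above. A hedged sketch,
assuming headers that carry the new PR_CAP_AMBIENT definitions:

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/capability.h>

    /* Raise CAP_NET_BIND_SERVICE into the ambient set so it survives an
     * execve() of a non-setcap binary. Fails with EPERM unless the cap
     * is already permitted and inheritable and ambient raises aren't
     * blocked by securebits. */
    if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
              CAP_NET_BIND_SERVICE, 0, 0) < 0)
            perror("PR_CAP_AMBIENT_RAISE");

    /* Query one bit, or drop the whole set: */
    int is_set = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET,
                       CAP_NET_BIND_SERVICE, 0, 0);
    prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0);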
diff --git a/include/uapi/linux/securebits.h b/include/uapi/linux/securebits.h
index 985aac9e6bf8..35ac35cef217 100644
--- a/include/uapi/linux/securebits.h
+++ b/include/uapi/linux/securebits.h
@@ -43,9 +43,18 @@
#define SECBIT_KEEP_CAPS (issecure_mask(SECURE_KEEP_CAPS))
#define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED))
+/* When set, a process cannot add new capabilities to its ambient set. */
+#define SECURE_NO_CAP_AMBIENT_RAISE 6
+#define SECURE_NO_CAP_AMBIENT_RAISE_LOCKED 7 /* make bit-6 immutable */
+
+#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
+#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \
+ (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED))
+
#define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \
issecure_mask(SECURE_NO_SETUID_FIXUP) | \
- issecure_mask(SECURE_KEEP_CAPS))
+ issecure_mask(SECURE_KEEP_CAPS) | \
+ issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE))
#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
#endif /* _UAPI_LINUX_SECUREBITS_H */
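A process that wants to forbid ambient raises for itself and its descendants
can set (and lock) the new securebit. A hedged sketch; note that
PR_SET_SECUREBITS replaces the whole bit mask and itself requires
CAP_SETPCAP:

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/securebits.h>

    /* Disallow PR_CAP_AMBIENT_RAISE from now on, and lock the bit so
     * neither this process nor its children can clear it again. */
    if (prctl(PR_SET_SECUREBITS,
              SECBIT_NO_CAP_AMBIENT_RAISE |
              SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED, 0, 0, 0) < 0)
            perror("PR_SET_SECUREBITS");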
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
new file mode 100644
index 000000000000..df0e09bb7dd5
--- /dev/null
+++ b/include/uapi/linux/userfaultfd.h
@@ -0,0 +1,169 @@
+/*
+ * include/uapi/linux/userfaultfd.h
+ *
+ * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_H
+#define _LINUX_USERFAULTFD_H
+
+#include <linux/types.h>
+
+#include <linux/compiler.h>
+
+#define UFFD_API ((__u64)0xAA)
+/*
+ * After implementing the respective features it will become:
+ * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
+ * UFFD_FEATURE_EVENT_FORK)
+ */
+#define UFFD_API_FEATURES (0)
+#define UFFD_API_IOCTLS \
+ ((__u64)1 << _UFFDIO_REGISTER | \
+ (__u64)1 << _UFFDIO_UNREGISTER | \
+ (__u64)1 << _UFFDIO_API)
+#define UFFD_API_RANGE_IOCTLS \
+ ((__u64)1 << _UFFDIO_WAKE | \
+ (__u64)1 << _UFFDIO_COPY | \
+ (__u64)1 << _UFFDIO_ZEROPAGE)
+
+/*
+ * Valid ioctl command number range with this API is from 0x00 to
+ * 0x3F. UFFDIO_API is the fixed number, everything else can be
+ * changed by implementing a different UFFD_API. If sticking to the
+ * same UFFD_API more ioctl can be added and userland will be aware of
+ * which ioctl the running kernel implements through the ioctl command
+ * bitmask written by the UFFDIO_API.
+ */
+#define _UFFDIO_REGISTER (0x00)
+#define _UFFDIO_UNREGISTER (0x01)
+#define _UFFDIO_WAKE (0x02)
+#define _UFFDIO_COPY (0x03)
+#define _UFFDIO_ZEROPAGE (0x04)
+#define _UFFDIO_API (0x3F)
+
+/* userfaultfd ioctl ids */
+#define UFFDIO 0xAA
+#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \
+ struct uffdio_api)
+#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \
+ struct uffdio_register)
+#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \
+ struct uffdio_range)
+#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \
+ struct uffdio_range)
+#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \
+ struct uffdio_copy)
+#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
+ struct uffdio_zeropage)
+
+/* read() structure */
+struct uffd_msg {
+ __u8 event;
+
+ __u8 reserved1;
+ __u16 reserved2;
+ __u32 reserved3;
+
+ union {
+ struct {
+ __u64 flags;
+ __u64 address;
+ } pagefault;
+
+ struct {
+ /* unused reserved fields */
+ __u64 reserved1;
+ __u64 reserved2;
+ __u64 reserved3;
+ } reserved;
+ } arg;
+} __packed;
+
+/*
+ * Start at 0x12 and not at 0 to be more strict against bugs.
+ */
+#define UFFD_EVENT_PAGEFAULT 0x12
+#if 0 /* not available yet */
+#define UFFD_EVENT_FORK 0x13
+#endif
+
+/* flags for UFFD_EVENT_PAGEFAULT */
+#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
+#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */
+
+struct uffdio_api {
+ /* userland asks for an API number and the features to enable */
+ __u64 api;
+ /*
+ * Kernel answers below with all the available features for
+ * the API, this notifies userland of which events and/or
+ * which flags for each event are enabled in the current
+ * kernel.
+ *
+ * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
+ * are to be considered implicitly always enabled in all kernels as
+ * long as the uffdio_api.api requested matches UFFD_API.
+ */
+#if 0 /* not available yet */
+#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
+#define UFFD_FEATURE_EVENT_FORK (1<<1)
+#endif
+ __u64 features;
+
+ __u64 ioctls;
+};
+
+struct uffdio_range {
+ __u64 start;
+ __u64 len;
+};
+
+struct uffdio_register {
+ struct uffdio_range range;
+#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
+#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1)
+ __u64 mode;
+
+ /*
+ * kernel answers which ioctl commands are available for the
+ * range, keep at the end as the last 8 bytes aren't read.
+ */
+ __u64 ioctls;
+};
+
+struct uffdio_copy {
+ __u64 dst;
+ __u64 src;
+ __u64 len;
+ /*
+ * There will be a wrprotection flag later that allows mapping
+ * pages wrprotected on the fly. Such a flag will only be
+ * available if the wrprotection ioctls are implemented for the
+ * range, according to uffdio_register.ioctls.
+ */
+#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * "copy" is written by the ioctl and must be at the end: the
+ * copy_from_user will not read the last 8 bytes.
+ */
+ __s64 copy;
+};
+
+struct uffdio_zeropage {
+ struct uffdio_range range;
+#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0)
+ __u64 mode;
+
+ /*
+ * "zeropage" is written by the ioctl and must be at the end:
+ * the copy_from_user will not read the last 8 bytes.
+ */
+ __s64 zeropage;
+};
+
+#endif /* _LINUX_USERFAULTFD_H */
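Putting the uAPI together: a hedged, minimal userspace sketch that handshakes
with UFFDIO_API, registers a region for missing-page tracking, and resolves
the first fault with UFFDIO_COPY from a second thread. Error handling is
trimmed; it assumes CONFIG_USERFAULTFD=y, installed headers that ship
linux/userfaultfd.h, and a libc whose sys/syscall.h defines
__NR_userfaultfd. Build with something like: gcc -pthread demo.c

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/userfaultfd.h>
    #include <poll.h>
    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int uffd;
    static long page_size;

    static void *fault_handler(void *arg)
    {
            struct uffd_msg msg;
            struct pollfd pfd = { .fd = uffd, .events = POLLIN };

            /* Wait for the faulting thread, then read one uffd_msg. */
            poll(&pfd, 1, -1);
            if (read(uffd, &msg, sizeof(msg)) != sizeof(msg) ||
                msg.event != UFFD_EVENT_PAGEFAULT)
                    return NULL;

            /* Resolve the fault: copy a filled page into place. Without
             * UFFDIO_COPY_MODE_DONTWAKE the ioctl also wakes the waiter. */
            char *src = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            memset(src, 0x42, page_size);

            struct uffdio_copy copy = {
                    .dst = msg.arg.pagefault.address & ~(__u64)(page_size - 1),
                    .src = (unsigned long)src,
                    .len = page_size,
                    .mode = 0,
            };
            ioctl(uffd, UFFDIO_COPY, &copy);
            return NULL;
    }

    int main(void)
    {
            page_size = sysconf(_SC_PAGESIZE);

            /* O_CLOEXEC/O_NONBLOCK are the only flags userfaultfd accepts. */
            uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
            if (uffd < 0) {
                    perror("userfaultfd");
                    return 1;
            }

            /* Handshake: agree on UFFD_API, learn the supported ioctls. */
            struct uffdio_api api = { .api = UFFD_API, .features = 0 };
            if (ioctl(uffd, UFFDIO_API, &api) < 0)
                    return 1;

            char *area = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            /* Track missing pages (VM_UFFD_MISSING) in this range. */
            struct uffdio_register reg = {
                    .range = { .start = (unsigned long)area, .len = page_size },
                    .mode  = UFFDIO_REGISTER_MODE_MISSING,
            };
            if (ioctl(uffd, UFFDIO_REGISTER, &reg) < 0)
                    return 1;

            pthread_t thr;
            pthread_create(&thr, NULL, fault_handler, NULL);

            /* First touch of the never-populated page blocks in
             * handle_userfault() until the handler thread UFFDIO_COPYs. */
            printf("area[0] = 0x%02x\n", (unsigned char)area[0]);
            pthread_join(thr, NULL);
            return 0;
    }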