From 30aba6656f61ed44cba445a3c0d38b296fa9e8f5 Mon Sep 17 00:00:00 2001
From: Salvatore Mesoraca <s.mesoraca16@gmail.com>
Date: Thu, 23 Aug 2018 17:00:35 -0700
Subject: namei: allow restricted O_CREAT of FIFOs and regular files

Disallows open of FIFOs or regular files not owned by the user in world
writable sticky directories, unless the owner is the same as that of the
directory or the file is opened without the O_CREAT flag.  The purpose
is to make data spoofing attacks harder.  This protection can be turned
on and off separately for FIFOs and regular files via sysctl, just like
the symlinks/hardlinks protection.  This patch is based on Openwall's
"HARDEN_FIFO" feature by Solar Designer.

This is a brief list of old vulnerabilities that could have been prevented
by this feature, some of them even allow for privilege escalation:

CVE-2000-1134
CVE-2007-3852
CVE-2008-0525
CVE-2009-0416
CVE-2011-4834
CVE-2015-1838
CVE-2015-7442
CVE-2016-7489

This list is not meant to be complete.  It's difficult to track down all
vulnerabilities of this kind because they were often reported without any
mention of this particular attack vector.  In fact, before
hardlinks/symlinks restrictions, fifos/regular files weren't the favorite
vehicle to exploit them.

[s.mesoraca16@gmail.com: fix bug reported by Dan Carpenter]
  Link: https://lkml.kernel.org/r/20180426081456.GA7060@mwanda
  Link: http://lkml.kernel.org/r/1524829819-11275-1-git-send-email-s.mesoraca16@gmail.com
[keescook@chromium.org: drop pr_warn_ratelimited() in favor of audit changes in the future]
[keescook@chromium.org: adjust commit subjet]
Link: http://lkml.kernel.org/r/20180416175918.GA13494@beast
Signed-off-by: Salvatore Mesoraca <s.mesoraca16@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Suggested-by: Solar Designer <solar@openwall.com>
Suggested-by: Kees Cook <keescook@chromium.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e5710541183b..33322702c910 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -74,6 +74,8 @@ extern struct inodes_stat_t inodes_stat;
 extern int leases_enable, lease_break_time;
 extern int sysctl_protected_symlinks;
 extern int sysctl_protected_hardlinks;
+extern int sysctl_protected_fifos;
+extern int sysctl_protected_regular;
 
 typedef __kernel_rwf_t rwf_t;
 
-- 
cgit v1.2.3


From d4ae9916ea2947341180d2b538f48875ff393a86 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Thu, 23 Aug 2018 17:00:42 -0700
Subject: mm: soft-offline: close the race against page allocation

A process can be killed with SIGBUS(BUS_MCEERR_AR) when it tries to
allocate a page that was just freed on the way of soft-offline.  This is
undesirable because soft-offline (which is about corrected error) is
less aggressive than hard-offline (which is about uncorrected error),
and we can make soft-offline fail and keep using the page for good
reason like "system is busy."

Two main changes of this patch are:

- setting migrate type of the target page to MIGRATE_ISOLATE. As done
  in free_unref_page_commit(), this makes kernel bypass pcplist when
  freeing the page. So we can assume that the page is in freelist just
  after put_page() returns,

- setting PG_hwpoison on free page under zone->lock which protects
  freelists, so this allows us to avoid setting PG_hwpoison on a page
  that is decided to be allocated soon.

[akpm@linux-foundation.org: tweak set_hwpoison_free_buddy_page() comment]
Link: http://lkml.kernel.org/r/1531452366-11661-3-git-send-email-n-horiguchi@ah.jp.nec.com
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reported-by: Xishi Qiu <xishi.qiuxishi@alibaba-inc.com>
Tested-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: <zy.zhengyi@alibaba-inc.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h |  5 +++++
 include/linux/swapops.h    | 10 ----------
 2 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 901943e4754b..74bee8cecf4c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -369,8 +369,13 @@ PAGEFLAG_FALSE(Uncached)
 PAGEFLAG(HWPoison, hwpoison, PF_ANY)
 TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
 #define __PG_HWPOISON (1UL << PG_hwpoison)
+extern bool set_hwpoison_free_buddy_page(struct page *page);
 #else
 PAGEFLAG_FALSE(HWPoison)
+static inline bool set_hwpoison_free_buddy_page(struct page *page)
+{
+	return 0;
+}
 #define __PG_HWPOISON 0
 #endif
 
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 1d3877c39a00..568a3553d918 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -340,11 +340,6 @@ static inline int is_hwpoison_entry(swp_entry_t entry)
 	return swp_type(entry) == SWP_HWPOISON;
 }
 
-static inline bool test_set_page_hwpoison(struct page *page)
-{
-	return TestSetPageHWPoison(page);
-}
-
 static inline void num_poisoned_pages_inc(void)
 {
 	atomic_long_inc(&num_poisoned_pages);
@@ -367,11 +362,6 @@ static inline int is_hwpoison_entry(swp_entry_t swp)
 	return 0;
 }
 
-static inline bool test_set_page_hwpoison(struct page *page)
-{
-	return false;
-}
-
 static inline void num_poisoned_pages_inc(void)
 {
 }
-- 
cgit v1.2.3


From 263fade51f7bdd5ad7ebbfe82113a44c5ea4c36c Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.vnet.ibm.com>
Date: Thu, 23 Aug 2018 17:01:15 -0700
Subject: docs/mm: make GFP flags descriptions usable as kernel-doc

This patch adds DOC: headings for GFP flag descriptions and adjusts the
formatting to fit sphinx expectations of paragraphs.

Link: http://lkml.kernel.org/r/1532626360-16650-7-git-send-email-rppt@linux.vnet.ibm.com
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/gfp.h | 291 +++++++++++++++++++++++++++-------------------------
 1 file changed, 154 insertions(+), 137 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index a6afcec53795..24bcc5eec6b4 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -59,29 +59,32 @@ struct vm_area_struct;
 #define __GFP_MOVABLE	((__force gfp_t)___GFP_MOVABLE)  /* ZONE_MOVABLE allowed */
 #define GFP_ZONEMASK	(__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
 
-/*
+/**
+ * DOC: Page mobility and placement hints
+ *
  * Page mobility and placement hints
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  *
  * These flags provide hints about how mobile the page is. Pages with similar
  * mobility are placed within the same pageblocks to minimise problems due
  * to external fragmentation.
  *
- * __GFP_MOVABLE (also a zone modifier) indicates that the page can be
- *   moved by page migration during memory compaction or can be reclaimed.
+ * %__GFP_MOVABLE (also a zone modifier) indicates that the page can be
+ * moved by page migration during memory compaction or can be reclaimed.
  *
- * __GFP_RECLAIMABLE is used for slab allocations that specify
- *   SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers.
+ * %__GFP_RECLAIMABLE is used for slab allocations that specify
+ * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers.
  *
- * __GFP_WRITE indicates the caller intends to dirty the page. Where possible,
- *   these pages will be spread between local zones to avoid all the dirty
- *   pages being in one zone (fair zone allocation policy).
+ * %__GFP_WRITE indicates the caller intends to dirty the page. Where possible,
+ * these pages will be spread between local zones to avoid all the dirty
+ * pages being in one zone (fair zone allocation policy).
  *
- * __GFP_HARDWALL enforces the cpuset memory allocation policy.
+ * %__GFP_HARDWALL enforces the cpuset memory allocation policy.
  *
- * __GFP_THISNODE forces the allocation to be satisified from the requested
- *   node with no fallbacks or placement policy enforcements.
+ * %__GFP_THISNODE forces the allocation to be satisified from the requested
+ * node with no fallbacks or placement policy enforcements.
  *
- * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
+ * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
  */
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)
@@ -89,54 +92,60 @@ struct vm_area_struct;
 #define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)
 #define __GFP_ACCOUNT	((__force gfp_t)___GFP_ACCOUNT)
 
-/*
+/**
+ * DOC: Watermark modifiers
+ *
  * Watermark modifiers -- controls access to emergency reserves
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  *
- * __GFP_HIGH indicates that the caller is high-priority and that granting
- *   the request is necessary before the system can make forward progress.
- *   For example, creating an IO context to clean pages.
+ * %__GFP_HIGH indicates that the caller is high-priority and that granting
+ * the request is necessary before the system can make forward progress.
+ * For example, creating an IO context to clean pages.
  *
- * __GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
- *   high priority. Users are typically interrupt handlers. This may be
- *   used in conjunction with __GFP_HIGH
+ * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is
+ * high priority. Users are typically interrupt handlers. This may be
+ * used in conjunction with %__GFP_HIGH
  *
- * __GFP_MEMALLOC allows access to all memory. This should only be used when
- *   the caller guarantees the allocation will allow more memory to be freed
- *   very shortly e.g. process exiting or swapping. Users either should
- *   be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
+ * %__GFP_MEMALLOC allows access to all memory. This should only be used when
+ * the caller guarantees the allocation will allow more memory to be freed
+ * very shortly e.g. process exiting or swapping. Users either should
+ * be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
  *
- * __GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
- *   This takes precedence over the __GFP_MEMALLOC flag if both are set.
+ * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
+ * This takes precedence over the %__GFP_MEMALLOC flag if both are set.
  */
 #define __GFP_ATOMIC	((__force gfp_t)___GFP_ATOMIC)
 #define __GFP_HIGH	((__force gfp_t)___GFP_HIGH)
 #define __GFP_MEMALLOC	((__force gfp_t)___GFP_MEMALLOC)
 #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC)
 
-/*
+/**
+ * DOC: Reclaim modifiers
+ *
  * Reclaim modifiers
+ * ~~~~~~~~~~~~~~~~~
  *
- * __GFP_IO can start physical IO.
+ * %__GFP_IO can start physical IO.
  *
- * __GFP_FS can call down to the low-level FS. Clearing the flag avoids the
- *   allocator recursing into the filesystem which might already be holding
- *   locks.
+ * %__GFP_FS can call down to the low-level FS. Clearing the flag avoids the
+ * allocator recursing into the filesystem which might already be holding
+ * locks.
  *
- * __GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim.
- *   This flag can be cleared to avoid unnecessary delays when a fallback
- *   option is available.
+ * %__GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim.
+ * This flag can be cleared to avoid unnecessary delays when a fallback
+ * option is available.
  *
- * __GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when
- *   the low watermark is reached and have it reclaim pages until the high
- *   watermark is reached. A caller may wish to clear this flag when fallback
- *   options are available and the reclaim is likely to disrupt the system. The
- *   canonical example is THP allocation where a fallback is cheap but
- *   reclaim/compaction may cause indirect stalls.
+ * %__GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when
+ * the low watermark is reached and have it reclaim pages until the high
+ * watermark is reached. A caller may wish to clear this flag when fallback
+ * options are available and the reclaim is likely to disrupt the system. The
+ * canonical example is THP allocation where a fallback is cheap but
+ * reclaim/compaction may cause indirect stalls.
  *
- * __GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
+ * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim.
  *
  * The default allocator behavior depends on the request size. We have a concept
- * of so called costly allocations (with order > PAGE_ALLOC_COSTLY_ORDER).
+ * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER).
  * !costly allocations are too essential to fail so they are implicitly
  * non-failing by default (with some exceptions like OOM victims might fail so
  * the caller still has to check for failures) while costly requests try to be
@@ -144,40 +153,40 @@ struct vm_area_struct;
  * The following three modifiers might be used to override some of these
  * implicit rules
  *
- * __GFP_NORETRY: The VM implementation will try only very lightweight
- *   memory direct reclaim to get some memory under memory pressure (thus
- *   it can sleep). It will avoid disruptive actions like OOM killer. The
- *   caller must handle the failure which is quite likely to happen under
- *   heavy memory pressure. The flag is suitable when failure can easily be
- *   handled at small cost, such as reduced throughput
- *
- * __GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
- *   procedures that have previously failed if there is some indication
- *   that progress has been made else where.  It can wait for other
- *   tasks to attempt high level approaches to freeing memory such as
- *   compaction (which removes fragmentation) and page-out.
- *   There is still a definite limit to the number of retries, but it is
- *   a larger limit than with __GFP_NORETRY.
- *   Allocations with this flag may fail, but only when there is
- *   genuinely little unused memory. While these allocations do not
- *   directly trigger the OOM killer, their failure indicates that
- *   the system is likely to need to use the OOM killer soon.  The
- *   caller must handle failure, but can reasonably do so by failing
- *   a higher-level request, or completing it only in a much less
- *   efficient manner.
- *   If the allocation does fail, and the caller is in a position to
- *   free some non-essential memory, doing so could benefit the system
- *   as a whole.
- *
- * __GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
- *   cannot handle allocation failures. The allocation could block
- *   indefinitely but will never return with failure. Testing for
- *   failure is pointless.
- *   New users should be evaluated carefully (and the flag should be
- *   used only when there is no reasonable failure policy) but it is
- *   definitely preferable to use the flag rather than opencode endless
- *   loop around allocator.
- *   Using this flag for costly allocations is _highly_ discouraged.
+ * %__GFP_NORETRY: The VM implementation will try only very lightweight
+ * memory direct reclaim to get some memory under memory pressure (thus
+ * it can sleep). It will avoid disruptive actions like OOM killer. The
+ * caller must handle the failure which is quite likely to happen under
+ * heavy memory pressure. The flag is suitable when failure can easily be
+ * handled at small cost, such as reduced throughput
+ *
+ * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim
+ * procedures that have previously failed if there is some indication
+ * that progress has been made else where.  It can wait for other
+ * tasks to attempt high level approaches to freeing memory such as
+ * compaction (which removes fragmentation) and page-out.
+ * There is still a definite limit to the number of retries, but it is
+ * a larger limit than with %__GFP_NORETRY.
+ * Allocations with this flag may fail, but only when there is
+ * genuinely little unused memory. While these allocations do not
+ * directly trigger the OOM killer, their failure indicates that
+ * the system is likely to need to use the OOM killer soon.  The
+ * caller must handle failure, but can reasonably do so by failing
+ * a higher-level request, or completing it only in a much less
+ * efficient manner.
+ * If the allocation does fail, and the caller is in a position to
+ * free some non-essential memory, doing so could benefit the system
+ * as a whole.
+ *
+ * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller
+ * cannot handle allocation failures. The allocation could block
+ * indefinitely but will never return with failure. Testing for
+ * failure is pointless.
+ * New users should be evaluated carefully (and the flag should be
+ * used only when there is no reasonable failure policy) but it is
+ * definitely preferable to use the flag rather than opencode endless
+ * loop around allocator.
+ * Using this flag for costly allocations is _highly_ discouraged.
  */
 #define __GFP_IO	((__force gfp_t)___GFP_IO)
 #define __GFP_FS	((__force gfp_t)___GFP_FS)
@@ -188,14 +197,17 @@ struct vm_area_struct;
 #define __GFP_NOFAIL	((__force gfp_t)___GFP_NOFAIL)
 #define __GFP_NORETRY	((__force gfp_t)___GFP_NORETRY)
 
-/*
+/**
+ * DOC: Action modifiers
+ *
  * Action modifiers
+ * ~~~~~~~~~~~~~~~~
  *
- * __GFP_NOWARN suppresses allocation failure reports.
+ * %__GFP_NOWARN suppresses allocation failure reports.
  *
- * __GFP_COMP address compound page metadata.
+ * %__GFP_COMP address compound page metadata.
  *
- * __GFP_ZERO returns a zeroed page on success.
+ * %__GFP_ZERO returns a zeroed page on success.
  */
 #define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
 #define __GFP_COMP	((__force gfp_t)___GFP_COMP)
@@ -208,66 +220,71 @@ struct vm_area_struct;
 #define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP))
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
-/*
+/**
+ * DOC: Useful GFP flag combinations
+ *
+ * Useful GFP flag combinations
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
  * Useful GFP flag combinations that are commonly used. It is recommended
  * that subsystems start with one of these combinations and then set/clear
- * __GFP_FOO flags as necessary.
- *
- * GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower
- *   watermark is applied to allow access to "atomic reserves"
- *
- * GFP_KERNEL is typical for kernel-internal allocations. The caller requires
- *   ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
- *
- * GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is
- *   accounted to kmemcg.
- *
- * GFP_NOWAIT is for kernel allocations that should not stall for direct
- *   reclaim, start physical IO or use any filesystem callback.
- *
- * GFP_NOIO will use direct reclaim to discard clean pages or slab pages
- *   that do not require the starting of any physical IO.
- *   Please try to avoid using this flag directly and instead use
- *   memalloc_noio_{save,restore} to mark the whole scope which cannot
- *   perform any IO with a short explanation why. All allocation requests
- *   will inherit GFP_NOIO implicitly.
- *
- * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
- *   Please try to avoid using this flag directly and instead use
- *   memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
- *   recurse into the FS layer with a short explanation why. All allocation
- *   requests will inherit GFP_NOFS implicitly.
- *
- * GFP_USER is for userspace allocations that also need to be directly
- *   accessibly by the kernel or hardware. It is typically used by hardware
- *   for buffers that are mapped to userspace (e.g. graphics) that hardware
- *   still must DMA to. cpuset limits are enforced for these allocations.
- *
- * GFP_DMA exists for historical reasons and should be avoided where possible.
- *   The flags indicates that the caller requires that the lowest zone be
- *   used (ZONE_DMA or 16M on x86-64). Ideally, this would be removed but
- *   it would require careful auditing as some users really require it and
- *   others use the flag to avoid lowmem reserves in ZONE_DMA and treat the
- *   lowest zone as a type of emergency reserve.
- *
- * GFP_DMA32 is similar to GFP_DMA except that the caller requires a 32-bit
- *   address.
- *
- * GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
- *   do not need to be directly accessible by the kernel but that cannot
- *   move once in use. An example may be a hardware allocation that maps
- *   data directly into userspace but has no addressing limitations.
- *
- * GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not
- *   need direct access to but can use kmap() when access is required. They
- *   are expected to be movable via page reclaim or page migration. Typically,
- *   pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE.
- *
- * GFP_TRANSHUGE and GFP_TRANSHUGE_LIGHT are used for THP allocations. They are
- *   compound allocations that will generally fail quickly if memory is not
- *   available and will not wake kswapd/kcompactd on failure. The _LIGHT
- *   version does not attempt reclaim/compaction at all and is by default used
- *   in page fault path, while the non-light is used by khugepaged.
+ * %__GFP_FOO flags as necessary.
+ *
+ * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower
+ * watermark is applied to allow access to "atomic reserves"
+ *
+ * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires
+ * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim.
+ *
+ * %GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is
+ * accounted to kmemcg.
+ *
+ * %GFP_NOWAIT is for kernel allocations that should not stall for direct
+ * reclaim, start physical IO or use any filesystem callback.
+ *
+ * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages
+ * that do not require the starting of any physical IO.
+ * Please try to avoid using this flag directly and instead use
+ * memalloc_noio_{save,restore} to mark the whole scope which cannot
+ * perform any IO with a short explanation why. All allocation requests
+ * will inherit GFP_NOIO implicitly.
+ *
+ * %GFP_NOFS will use direct reclaim but will not use any filesystem interfaces.
+ * Please try to avoid using this flag directly and instead use
+ * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't
+ * recurse into the FS layer with a short explanation why. All allocation
+ * requests will inherit GFP_NOFS implicitly.
+ *
+ * %GFP_USER is for userspace allocations that also need to be directly
+ * accessibly by the kernel or hardware. It is typically used by hardware
+ * for buffers that are mapped to userspace (e.g. graphics) that hardware
+ * still must DMA to. cpuset limits are enforced for these allocations.
+ *
+ * %GFP_DMA exists for historical reasons and should be avoided where possible.
+ * The flags indicates that the caller requires that the lowest zone be
+ * used (%ZONE_DMA or 16M on x86-64). Ideally, this would be removed but
+ * it would require careful auditing as some users really require it and
+ * others use the flag to avoid lowmem reserves in %ZONE_DMA and treat the
+ * lowest zone as a type of emergency reserve.
+ *
+ * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit
+ * address.
+ *
+ * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
+ * do not need to be directly accessible by the kernel but that cannot
+ * move once in use. An example may be a hardware allocation that maps
+ * data directly into userspace but has no addressing limitations.
+ *
+ * %GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not
+ * need direct access to but can use kmap() when access is required. They
+ * are expected to be movable via page reclaim or page migration. Typically,
+ * pages on the LRU would also be allocated with %GFP_HIGHUSER_MOVABLE.
+ *
+ * %GFP_TRANSHUGE and %GFP_TRANSHUGE_LIGHT are used for THP allocations. They
+ * are compound allocations that will generally fail quickly if memory is not
+ * available and will not wake kswapd/kcompactd on failure. The _LIGHT
+ * version does not attempt reclaim/compaction at all and is by default used
+ * in page fault path, while the non-light is used by khugepaged.
  */
 #define GFP_ATOMIC	(__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
 #define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
-- 
cgit v1.2.3


From 2b7403035459c75e193c6b04a293e518a4212de0 Mon Sep 17 00:00:00 2001
From: Souptick Joarder <jrdr.linux@gmail.com>
Date: Thu, 23 Aug 2018 17:01:36 -0700
Subject: mm: Change return type int to vm_fault_t for fault handlers

Use new return type vm_fault_t for fault handler.  For now, this is just
documenting that the function returns a VM_FAULT value rather than an
errno.  Once all instances are converted, vm_fault_t will become a
distinct type.

Ref-> commit 1c8f422059ae ("mm: change return type to vm_fault_t")

The aim is to change the return type of finish_fault() and
handle_mm_fault() to vm_fault_t type.  As part of that clean up return
type of all other recursively called functions have been changed to
vm_fault_t type.

The places from where handle_mm_fault() is getting invoked will be
change to vm_fault_t type but in a separate patch.

vmf_error() is the newly introduce inline function in 4.17-rc6.

[akpm@linux-foundation.org: don't shadow outer local `ret' in __do_huge_pmd_anonymous_page()]
Link: http://lkml.kernel.org/r/20180604171727.GA20279@jordon-HP-15-Notebook-PC
Signed-off-by: Souptick Joarder <jrdr.linux@gmail.com>
Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h       |  9 +++++----
 include/linux/hugetlb.h       |  2 +-
 include/linux/mm.h            | 14 +++++++-------
 include/linux/oom.h           |  2 +-
 include/linux/swapops.h       |  5 +++--
 include/linux/userfaultfd_k.h |  5 +++--
 6 files changed, 20 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a8a126259bc4..27e3e32135a8 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -6,7 +6,7 @@
 
 #include <linux/fs.h> /* only for vma_is_dax() */
 
-extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf);
+extern vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
 extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 			 struct vm_area_struct *vma);
@@ -23,7 +23,7 @@ static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
 }
 #endif
 
-extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
+extern vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
 extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 					  unsigned long addr,
 					  pmd_t *pmd,
@@ -216,7 +216,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 		pud_t *pud, int flags);
 
-extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
+extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
 
 extern struct page *huge_zero_page;
 
@@ -321,7 +321,8 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
 	return NULL;
 }
 
-static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd)
+static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf,
+		pmd_t orig_pmd)
 {
 	return 0;
 }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c39d9170a8a0..6b68e345f0ca 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -105,7 +105,7 @@ void hugetlb_report_meminfo(struct seq_file *);
 int hugetlb_report_node_meminfo(int, char *);
 void hugetlb_show_meminfo(void);
 unsigned long hugetlb_total_pages(void);
-int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, unsigned int flags);
 int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
 				struct vm_area_struct *dst_vma,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a9e733b5fb76..8fcc36660de6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -728,10 +728,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
-int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
+vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
 		struct page *page);
-int finish_fault(struct vm_fault *vmf);
-int finish_mkwrite_fault(struct vm_fault *vmf);
+vm_fault_t finish_fault(struct vm_fault *vmf);
+vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
 #endif
 
 /*
@@ -1403,8 +1403,8 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page);
 int invalidate_inode_page(struct page *page);
 
 #ifdef CONFIG_MMU
-extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
-		unsigned int flags);
+extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
+			unsigned long address, unsigned int flags);
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 			    unsigned long address, unsigned int fault_flags,
 			    bool *unlocked);
@@ -1413,7 +1413,7 @@ void unmap_mapping_pages(struct address_space *mapping,
 void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows);
 #else
-static inline int handle_mm_fault(struct vm_area_struct *vma,
+static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
 		unsigned long address, unsigned int flags)
 {
 	/* should never happen if there's no MMU */
@@ -2563,7 +2563,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
 #define FOLL_COW	0x4000	/* internal GUP flag */
 #define FOLL_ANON	0x8000	/* don't do file mappings */
 
-static inline int vm_fault_to_errno(int vm_fault, int foll_flags)
+static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
 {
 	if (vm_fault & VM_FAULT_OOM)
 		return -ENOMEM;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 92f70e4c6252..69864a547663 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -88,7 +88,7 @@ static inline bool mm_is_oom_victim(struct mm_struct *mm)
  *
  * Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise.
  */
-static inline int check_stable_address_space(struct mm_struct *mm)
+static inline vm_fault_t check_stable_address_space(struct mm_struct *mm)
 {
 	if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags)))
 		return VM_FAULT_SIGBUS;
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 568a3553d918..22af9d8a84ae 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -4,6 +4,7 @@
 
 #include <linux/radix-tree.h>
 #include <linux/bug.h>
+#include <linux/mm_types.h>
 
 /*
  * swapcache pages are stored in the swapper_space radix tree.  We want to
@@ -134,7 +135,7 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry)
 	return pfn_to_page(swp_offset(entry));
 }
 
-int device_private_entry_fault(struct vm_area_struct *vma,
+vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
 		       unsigned long addr,
 		       swp_entry_t entry,
 		       unsigned int flags,
@@ -169,7 +170,7 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry)
 	return NULL;
 }
 
-static inline int device_private_entry_fault(struct vm_area_struct *vma,
+static inline vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
 				     unsigned long addr,
 				     swp_entry_t entry,
 				     unsigned int flags,
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index e091f0a11b11..37c9eba75c98 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -28,7 +28,7 @@
 #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
 
-extern int handle_userfault(struct vm_fault *vmf, unsigned long reason);
+extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
 extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
 			    unsigned long src_start, unsigned long len,
@@ -77,7 +77,8 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm,
 #else /* CONFIG_USERFAULTFD */
 
 /* mm helpers */
-static inline int handle_userfault(struct vm_fault *vmf, unsigned long reason)
+static inline vm_fault_t handle_userfault(struct vm_fault *vmf,
+				unsigned long reason)
 {
 	return VM_FAULT_SIGBUS;
 }
-- 
cgit v1.2.3