From d1908f52557b3230fbd63c0429f3b4b748bf2b6d Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 3 Feb 2017 13:13:26 -0800
Subject: fs: break out of iomap_file_buffered_write on fatal signals

Tetsuo has noticed that an OOM stress test which performs large write
requests can cause the full memory reserves depletion.  He has tracked
this down to the following path

	__alloc_pages_nodemask+0x436/0x4d0
	alloc_pages_current+0x97/0x1b0
	__page_cache_alloc+0x15d/0x1a0          mm/filemap.c:728
	pagecache_get_page+0x5a/0x2b0           mm/filemap.c:1331
	grab_cache_page_write_begin+0x23/0x40   mm/filemap.c:2773
	iomap_write_begin+0x50/0xd0             fs/iomap.c:118
	iomap_write_actor+0xb5/0x1a0            fs/iomap.c:190
	? iomap_write_end+0x80/0x80             fs/iomap.c:150
	iomap_apply+0xb3/0x130                  fs/iomap.c:79
	iomap_file_buffered_write+0x68/0xa0     fs/iomap.c:243
	? iomap_write_end+0x80/0x80
	xfs_file_buffered_aio_write+0x132/0x390 [xfs]
	? remove_wait_queue+0x59/0x60
	xfs_file_write_iter+0x90/0x130 [xfs]
	__vfs_write+0xe5/0x140
	vfs_write+0xc7/0x1f0
	? syscall_trace_enter+0x1d0/0x380
	SyS_write+0x58/0xc0
	do_syscall_64+0x6c/0x200
	entry_SYSCALL64_slow_path+0x25/0x25

the oom victim has access to all memory reserves to make a forward
progress to exit easier.  But iomap_file_buffered_write and other
callers of iomap_apply loop to complete the full request.  We need to
check for fatal signals and back off with a short write instead.

As the iomap_apply delegates all the work down to the actor we have to
hook into those.  All callers that work with the page cache are calling
iomap_write_begin so we will check for signals there.  dax_iomap_actor
has to handle the situation explicitly because it copies data to the
userspace directly.  Other callers like iomap_page_mkwrite work on a
single page or iomap_fiemap_actor do not allocate memory based on the
given len.

Fixes: 68a9f5e7007c ("xfs: implement iomap based buffered write path")
Link: http://lkml.kernel.org/r/20170201092706.9966-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: <stable@vger.kernel.org>	[4.8+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c   | 5 +++++
 fs/iomap.c | 3 +++
 2 files changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/dax.c b/fs/dax.c
index 3af2da5e64ce..c45598b912e1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1031,6 +1031,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct blk_dax_ctl dax = { 0 };
 		ssize_t map_len;
 
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
 		dax.sector = dax_iomap_sector(iomap, pos);
 		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
 		map_len = dax_map_atomic(iomap->bdev, &dax);
diff --git a/fs/iomap.c b/fs/iomap.c
index 354a123f170e..a51cb4c07d4d 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -114,6 +114,9 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 
 	BUG_ON(pos + len > iomap->offset + iomap->length);
 
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
 	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
-- 
cgit v1.2.3


From b6789123bccba8b5feb9901ed2e8c3c39181979d Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Tue, 7 Feb 2017 11:11:16 -0800
Subject: mm: fix KPF_SWAPCACHE in /proc/kpageflags

Commit 6326fec1122c ("mm: Use owner_priv bit for PageSwapCache, valid
when PageSwapBacked") aliased PG_swapcache to PG_owner_priv_1 (and
depending on PageSwapBacked being true).

As a result, the KPF_SWAPCACHE bit in '/proc/kpageflags' should now be
synthesized, instead of being shown on unrelated pages which just happen
to have PG_owner_priv_1 set.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/page.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/page.c b/fs/proc/page.c
index a2066e6dee90..2726536489b1 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -173,7 +173,8 @@ u64 stable_page_flags(struct page *page)
 	u |= kpf_copy_bit(k, KPF_ACTIVE,	PG_active);
 	u |= kpf_copy_bit(k, KPF_RECLAIM,	PG_reclaim);
 
-	u |= kpf_copy_bit(k, KPF_SWAPCACHE,	PG_swapcache);
+	if (PageSwapCache(page))
+		u |= 1 << KPF_SWAPCACHE;
 	u |= kpf_copy_bit(k, KPF_SWAPBACKED,	PG_swapbacked);
 
 	u |= kpf_copy_bit(k, KPF_UNEVICTABLE,	PG_unevictable);
-- 
cgit v1.2.3


From 2a362249187a8d0f6d942d6e1d763d150a296f47 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Mon, 6 Feb 2017 19:39:09 -0500
Subject: btrfs: fix btrfs_compat_ioctl failures on non-compat ioctls

Commit 4c63c2454ef incorrectly assumed that returning -ENOIOCTLCMD would
cause the native ioctl to be called.  The ->compat_ioctl callback is
expected to handle all ioctls, not just compat variants.  As a result,
when using 32-bit userspace on 64-bit kernels, everything except those
three ioctls would return -ENOTTY.

Fixes: 4c63c2454ef ("btrfs: bugfix: handle FS_IOC32_{GETFLAGS,SETFLAGS,GETVERSION} in btrfs_ioctl")
Cc: stable@vger.kernel.org
Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ioctl.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0a6902555e65..0c77cad89e87 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -5665,6 +5665,10 @@ long btrfs_ioctl(struct file *file, unsigned int
 #ifdef CONFIG_COMPAT
 long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
+	/*
+	 * These all access 32-bit values anyway so no further
+	 * handling is necessary.
+	 */
 	switch (cmd) {
 	case FS_IOC32_GETFLAGS:
 		cmd = FS_IOC_GETFLAGS;
@@ -5675,8 +5679,6 @@ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case FS_IOC32_GETVERSION:
 		cmd = FS_IOC_GETVERSION;
 		break;
-	default:
-		return -ENOIOCTLCMD;
 	}
 
 	return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
-- 
cgit v1.2.3


From 8672aed7bd865774257efd40929702759a869329 Mon Sep 17 00:00:00 2001
From: Brian Norris <briannorris@chromium.org>
Date: Wed, 8 Feb 2017 22:44:44 -0800
Subject: pstore: don't OOPS when there are no ftrace zones

We'll OOPS in ramoops_get_next_prz() if the platform didn't ask for any
ftrace zones (i.e., cxt->fprzs will be NULL). Let's just skip this
entire FTRACE section if there's no 'fprzs'.

Regression seen on a coreboot/depthcharge-based Chromebook.

Fixes: 2fbea82bbb89 ("pstore: Merge per-CPU ftrace records into one")
Cc: Joel Fernandes <joelaf@google.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Brian Norris <briannorris@chromium.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 fs/pstore/ram.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 27c059e1760a..1d887efaaf71 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -280,7 +280,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
 					   1, id, type, PSTORE_TYPE_PMSG, 0);
 
 	/* ftrace is last since it may want to dynamically allocate memory. */
-	if (!prz_ok(prz)) {
+	if (!prz_ok(prz) && cxt->fprzs) {
 		if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) {
 			prz = ramoops_get_next_prz(cxt->fprzs,
 					&cxt->ftrace_read_cnt, 1, id, type,
-- 
cgit v1.2.3


From 0839ffb83e44e5ff1843e932592525fc2bff23ff Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 9 Feb 2017 14:20:42 -0500
Subject: nfsd: Revert "nfsd: special case truncates some more"

This patch incorrectly attempted nested mnt_want_write, and incorrectly
disabled nfsd's owner override for truncate.  We'll fix those problems
and make another attempt soon, for the moment I think the safest is to
revert.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 97 ++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 60 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ca13236dbb1f..26c6fdb4bf67 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -332,6 +332,37 @@ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
 	}
 }
 
+static __be32
+nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
+		struct iattr *iap)
+{
+	struct inode *inode = d_inode(fhp->fh_dentry);
+	int host_err;
+
+	if (iap->ia_size < inode->i_size) {
+		__be32 err;
+
+		err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
+				NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE);
+		if (err)
+			return err;
+	}
+
+	host_err = get_write_access(inode);
+	if (host_err)
+		goto out_nfserrno;
+
+	host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
+	if (host_err)
+		goto out_put_write_access;
+	return 0;
+
+out_put_write_access:
+	put_write_access(inode);
+out_nfserrno:
+	return nfserrno(host_err);
+}
+
 /*
  * Set various file attributes.  After this call fhp needs an fh_put.
  */
@@ -346,6 +377,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	__be32		err;
 	int		host_err;
 	bool		get_write_count;
+	int		size_change = 0;
 
 	if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
 		accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
@@ -358,11 +390,11 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 	/* Get inode */
 	err = fh_verify(rqstp, fhp, ftype, accmode);
 	if (err)
-		return err;
+		goto out;
 	if (get_write_count) {
 		host_err = fh_want_write(fhp);
 		if (host_err)
-			goto out_host_err;
+			return nfserrno(host_err);
 	}
 
 	dentry = fhp->fh_dentry;
@@ -373,59 +405,50 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 		iap->ia_valid &= ~ATTR_MODE;
 
 	if (!iap->ia_valid)
-		return 0;
+		goto out;
 
 	nfsd_sanitize_attrs(inode, iap);
 
-	if (check_guard && guardtime != inode->i_ctime.tv_sec)
-		return nfserr_notsync;
-
 	/*
 	 * The size case is special, it changes the file in addition to the
-	 * attributes, and file systems don't expect it to be mixed with
-	 * "random" attribute changes.  We thus split out the size change
-	 * into a separate call for vfs_truncate, and do the rest as a
-	 * a separate setattr call.
+	 * attributes.
 	 */
 	if (iap->ia_valid & ATTR_SIZE) {
-		struct path path = {
-			.mnt	= fhp->fh_export->ex_path.mnt,
-			.dentry	= dentry,
-		};
-		bool implicit_mtime = false;
+		err = nfsd_get_write_access(rqstp, fhp, iap);
+		if (err)
+			goto out;
+		size_change = 1;
 
 		/*
-		 * vfs_truncate implicity updates the mtime IFF the file size
-		 * actually changes.  Avoid the additional seattr call below if
-		 * the only other attribute that the client sends is the mtime.
+		 * RFC5661, Section 18.30.4:
+		 *   Changing the size of a file with SETATTR indirectly
+		 *   changes the time_modify and change attributes.
+		 *
+		 * (and similar for the older RFCs)
 		 */
-		if (iap->ia_size != i_size_read(inode) &&
-		    ((iap->ia_valid & ~(ATTR_SIZE | ATTR_MTIME)) == 0))
-			implicit_mtime = true;
-
-		host_err = vfs_truncate(&path, iap->ia_size);
-		if (host_err)
-			goto out_host_err;
-
-		iap->ia_valid &= ~ATTR_SIZE;
-		if (implicit_mtime)
-			iap->ia_valid &= ~ATTR_MTIME;
-		if (!iap->ia_valid)
-			goto done;
+		if (iap->ia_size != i_size_read(inode))
+			iap->ia_valid |= ATTR_MTIME;
 	}
 
 	iap->ia_valid |= ATTR_CTIME;
 
+	if (check_guard && guardtime != inode->i_ctime.tv_sec) {
+		err = nfserr_notsync;
+		goto out_put_write_access;
+	}
+
 	fh_lock(fhp);
 	host_err = notify_change(dentry, iap, NULL);
 	fh_unlock(fhp);
-	if (host_err)
-		goto out_host_err;
+	err = nfserrno(host_err);
 
-done:
-	host_err = commit_metadata(fhp);
-out_host_err:
-	return nfserrno(host_err);
+out_put_write_access:
+	if (size_change)
+		put_write_access(inode);
+	if (!err)
+		err = nfserrno(commit_metadata(fhp));
+out:
+	return err;
 }
 
 #if defined(CONFIG_NFSD_V4)
-- 
cgit v1.2.3


From 6e78b3f7a193546b1c00a6d084596e774f147169 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Fri, 10 Feb 2017 15:03:35 -0800
Subject: Btrfs: fix btrfs_decompress_buf2page()

If btrfs_decompress_buf2page() is handed a bio with its page in the
middle of the working buffer, then we adjust the offset into the working
buffer. After we copy into the bio, we advance the iterator by the
number of bytes we copied. Then, we have some logic to handle the case
of discontiguous pages and adjust the offset into the working buffer
again. However, if we didn't advance the bio to a new page, we may enter
this case in error, essentially repeating the adjustment that we already
made when we entered the function. The end result is bogus data in the
bio.

Previously, we only checked for this case when we advanced to a new
page, but the conversion to bio iterators changed that. This restores
the old, correct behavior.

A case I saw when testing with zlib was:

    buf_start = 42769
    total_out = 46865
    working_bytes = total_out - buf_start = 4096
    start_byte = 45056

The condition (total_out > start_byte && buf_start < start_byte) is
true, so we adjust the offset:

    buf_offset = start_byte - buf_start = 2287
    working_bytes -= buf_offset = 1809
    current_buf_start = buf_start = 42769

Then, we copy

    bytes = min(bvec.bv_len, PAGE_SIZE - buf_offset, working_bytes) = 1809
    buf_offset += bytes = 4096
    working_bytes -= bytes = 0
    current_buf_start += bytes = 44578

After bio_advance(), we are still in the same page, so start_byte is the
same. Then, we check (total_out > start_byte && current_buf_start < start_byte),
which is true! So, we adjust the values again:

    buf_offset = start_byte - buf_start = 2287
    working_bytes = total_out - start_byte = 1809
    current_buf_start = buf_start + buf_offset = 45056

But note that working_bytes was already zero before this, so we should
have stopped copying.

Fixes: 974b1adc3b10 ("btrfs: use bio iterators for the decompression handlers")
Reported-by: Pat Erley <pat-lkml@erley.org>
Reviewed-by: Chris Mason <clm@fb.com>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Tested-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/compression.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7f390849343b..c4444d6f439f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1024,6 +1024,7 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
 	unsigned long buf_offset;
 	unsigned long current_buf_start;
 	unsigned long start_byte;
+	unsigned long prev_start_byte;
 	unsigned long working_bytes = total_out - buf_start;
 	unsigned long bytes;
 	char *kaddr;
@@ -1071,26 +1072,34 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
 		if (!bio->bi_iter.bi_size)
 			return 0;
 		bvec = bio_iter_iovec(bio, bio->bi_iter);
-
+		prev_start_byte = start_byte;
 		start_byte = page_offset(bvec.bv_page) - disk_start;
 
 		/*
-		 * make sure our new page is covered by this
-		 * working buffer
+		 * We need to make sure we're only adjusting
+		 * our offset into compression working buffer when
+		 * we're switching pages.  Otherwise we can incorrectly
+		 * keep copying when we were actually done.
 		 */
-		if (total_out <= start_byte)
-			return 1;
+		if (start_byte != prev_start_byte) {
+			/*
+			 * make sure our new page is covered by this
+			 * working buffer
+			 */
+			if (total_out <= start_byte)
+				return 1;
 
-		/*
-		 * the next page in the biovec might not be adjacent
-		 * to the last page, but it might still be found
-		 * inside this working buffer. bump our offset pointer
-		 */
-		if (total_out > start_byte &&
-		    current_buf_start < start_byte) {
-			buf_offset = start_byte - buf_start;
-			working_bytes = total_out - start_byte;
-			current_buf_start = buf_start + buf_offset;
+			/*
+			 * the next page in the biovec might not be adjacent
+			 * to the last page, but it might still be found
+			 * inside this working buffer. bump our offset pointer
+			 */
+			if (total_out > start_byte &&
+			    current_buf_start < start_byte) {
+				buf_offset = start_byte - buf_start;
+				working_bytes = total_out - start_byte;
+				current_buf_start = buf_start + buf_offset;
+			}
 		}
 	}
 
-- 
cgit v1.2.3