diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-11-02 09:33:08 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-11-02 09:33:08 -0700 |
commit | c2aa1a444cab2c673650ada80a7dffc4345ce2e6 (patch) | |
tree | 3efb7e2213cabd174780b021a8dab2cea0b03386 /mm | |
parent | b69f9e17a57a50bc34d88975afce4425086e525d (diff) | |
parent | bf4a1fcf0bc18d52cf0fce6571d6f327ab5eaf22 (diff) |
Merge tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull vfs dedup fixes from Dave Chinner:
"This reworks the vfs data cloning infrastructure.
We discovered many issues with these interfaces late in the 4.19 cycle
- the worst of them (data corruption, setuid stripping) were fixed for
XFS in 4.19-rc8, but a larger rework of the infrastructure fixing all
the problems was needed. That rework is the contents of this pull
request.
Rework the vfs_clone_file_range and vfs_dedupe_file_range
infrastructure to use a common .remap_file_range method and supply
generic bounds and sanity checking functions that are shared with the
data write path. The current VFS infrastructure has problems with
rlimit, LFS file sizes, file time stamps, maximum filesystem file
sizes, stripping setuid bits, etc and so they are addressed in these
commits.
We also introduce the ability for the ->remap_file_range methods to
return short clones so that clones for vfs_copy_file_range() don't get
rejected if the entire range can't be cloned. It also allows
filesystems to sliently skip deduplication of partial EOF blocks if
they are not capable of doing so without requiring errors to be thrown
to userspace.
Existing filesystems are converted to user the new remap_file_range
method, and both XFS and ocfs2 are modified to make use of the new
generic checking infrastructure"
* tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (28 commits)
xfs: remove [cm]time update from reflink calls
xfs: remove xfs_reflink_remap_range
xfs: remove redundant remap partial EOF block checks
xfs: support returning partial reflink results
xfs: clean up xfs_reflink_remap_blocks call site
xfs: fix pagecache truncation prior to reflink
ocfs2: remove ocfs2_reflink_remap_range
ocfs2: support partial clone range and dedupe range
ocfs2: fix pagecache truncation prior to reflink
ocfs2: truncate page cache for clone destination file before remapping
vfs: clean up generic_remap_file_range_prep return value
vfs: hide file range comparison function
vfs: enable remap callers that can handle short operations
vfs: plumb remap flags through the vfs dedupe functions
vfs: plumb remap flags through the vfs clone functions
vfs: make remap_file_range functions take and return bytes completed
vfs: remap helper should update destination inode metadata
vfs: pass remap flags to generic_remap_checks
vfs: pass remap flags to generic_remap_file_range_prep
vfs: combine the clone and dedupe into a single remap_file_range
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 146 |
1 files changed, 119 insertions, 27 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 1fe6c4c37a35..81adec8ee02c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2825,6 +2825,42 @@ struct page *read_cache_page_gfp(struct address_space *mapping, EXPORT_SYMBOL(read_cache_page_gfp); /* + * Don't operate on ranges the page cache doesn't support, and don't exceed the + * LFS limits. If pos is under the limit it becomes a short access. If it + * exceeds the limit we return -EFBIG. + */ +static int generic_access_check_limits(struct file *file, loff_t pos, + loff_t *count) +{ + struct inode *inode = file->f_mapping->host; + loff_t max_size = inode->i_sb->s_maxbytes; + + if (!(file->f_flags & O_LARGEFILE)) + max_size = MAX_NON_LFS; + + if (unlikely(pos >= max_size)) + return -EFBIG; + *count = min(*count, max_size - pos); + return 0; +} + +static int generic_write_check_limits(struct file *file, loff_t pos, + loff_t *count) +{ + loff_t limit = rlimit(RLIMIT_FSIZE); + + if (limit != RLIM_INFINITY) { + if (pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + *count = min(*count, limit - pos); + } + + return generic_access_check_limits(file, pos, count); +} + +/* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. @@ -2835,8 +2871,8 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - unsigned long limit = rlimit(RLIMIT_FSIZE); - loff_t pos; + loff_t count; + int ret; if (!iov_iter_count(from)) return 0; @@ -2845,43 +2881,99 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); - pos = iocb->ki_pos; - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) return -EINVAL; - if (limit != RLIM_INFINITY) { - if (iocb->ki_pos >= limit) { - send_sig(SIGXFSZ, current, 0); - return -EFBIG; - } - iov_iter_truncate(from, limit - (unsigned long)pos); - } + count = iov_iter_count(from); + ret = generic_write_check_limits(file, iocb->ki_pos, &count); + if (ret) + return ret; + + iov_iter_truncate(from, count); + return iov_iter_count(from); +} +EXPORT_SYMBOL(generic_write_checks); + +/* + * Performs necessary checks before doing a clone. + * + * Can adjust amount of bytes to clone. + * Returns appropriate error code that caller should return or + * zero in case the clone should be allowed. + */ +int generic_remap_checks(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *req_count, unsigned int remap_flags) +{ + struct inode *inode_in = file_in->f_mapping->host; + struct inode *inode_out = file_out->f_mapping->host; + uint64_t count = *req_count; + uint64_t bcount; + loff_t size_in, size_out; + loff_t bs = inode_out->i_sb->s_blocksize; + int ret; + + /* The start of both ranges must be aligned to an fs block. */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) + return -EINVAL; + + /* Ensure offsets don't wrap. */ + if (pos_in + count < pos_in || pos_out + count < pos_out) + return -EINVAL; + + size_in = i_size_read(inode_in); + size_out = i_size_read(inode_out); + + /* Dedupe requires both ranges to be within EOF. */ + if ((remap_flags & REMAP_FILE_DEDUP) && + (pos_in >= size_in || pos_in + count > size_in || + pos_out >= size_out || pos_out + count > size_out)) + return -EINVAL; + + /* Ensure the infile range is within the infile. */ + if (pos_in >= size_in) + return -EINVAL; + count = min(count, size_in - (uint64_t)pos_in); + + ret = generic_access_check_limits(file_in, pos_in, &count); + if (ret) + return ret; + + ret = generic_write_check_limits(file_out, pos_out, &count); + if (ret) + return ret; /* - * LFS rule + * If the user wanted us to link to the infile's EOF, round up to the + * next block boundary for this check. + * + * Otherwise, make sure the count is also block-aligned, having + * already confirmed the starting offsets' block alignment. */ - if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && - !(file->f_flags & O_LARGEFILE))) { - if (pos >= MAX_NON_LFS) - return -EFBIG; - iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); + if (pos_in + count == size_in) { + bcount = ALIGN(size_in, bs) - pos_in; + } else { + if (!IS_ALIGNED(count, bs)) + count = ALIGN_DOWN(count, bs); + bcount = count; } + /* Don't allow overlapped cloning within the same file. */ + if (inode_in == inode_out && + pos_out + bcount > pos_in && + pos_out < pos_in + bcount) + return -EINVAL; + /* - * Are we about to exceed the fs block limit ? - * - * If we have written data it becomes a short write. If we have - * exceeded without writing data we send a signal and return EFBIG. - * Linus frestrict idea will clean these up nicely.. + * We shortened the request but the caller can't deal with that, so + * bounce the request back to userspace. */ - if (unlikely(pos >= inode->i_sb->s_maxbytes)) - return -EFBIG; + if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) + return -EINVAL; - iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); - return iov_iter_count(from); + *req_count = count; + return 0; } -EXPORT_SYMBOL(generic_write_checks); int pagecache_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, |