author    | Stefan Agner <stefan.agner@toradex.com> | 2019-11-12 13:57:30 +0100
committer | Stefan Agner <stefan.agner@toradex.com> | 2019-11-12 13:57:30 +0100
commit    | 401bf3f29b1aa6d9ca32bd3252fc9beabe93d80b (patch)
tree      | 6dd86325823a9b44b6e696acbd077e68c47d3108 /fs
parent    | 5a9c845fe261dec87f892f3fd3c2e32604d952c0 (diff)
parent    | b260a0862e3a9fccdac23ec3b783911b098c1c74 (diff)
Merge tag 'v5.3.10' into toradex_5.3.y
This is the 5.3.10 stable release
Diffstat (limited to 'fs')
42 files changed, 620 insertions, 402 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f131651502b8..c62903290f3a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -899,7 +899,7 @@ out_free_interp: the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { - int elf_prot, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE; + int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; @@ -931,13 +931,6 @@ out_free_interp: */ } } - - /* - * Some binaries have overlapping elf segments and then - * we have to forcefully map over an existing mapping - * e.g. over this newly established brk mapping. - */ - elf_fixed = MAP_FIXED; } elf_prot = make_prot(elf_ppnt->p_flags); @@ -950,7 +943,7 @@ out_free_interp: * the ET_DYN load_addr calculations, proceed normally. */ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { - elf_flags |= elf_fixed; + elf_flags |= MAP_FIXED; } else if (loc->elf_ex.e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program @@ -986,7 +979,7 @@ out_free_interp: load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - elf_flags |= elf_fixed; + elf_flags |= MAP_FIXED; } else load_bias = 0; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d9541d58ce3d..180749080fd8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -908,8 +908,6 @@ struct btrfs_fs_info { struct btrfs_workqueue *fixup_workers; struct btrfs_workqueue *delayed_workers; - /* the extent workers do delayed refs on the extent allocation tree */ - struct btrfs_workqueue *extent_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; u32 thread_pool_size; @@ -2760,8 +2758,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, int nitems, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 17f7c0d38768..8d2bb28ff5e0 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -371,7 +371,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) out_qgroup: btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); out_fail: - btrfs_inode_rsv_release(inode, true); if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); return ret; @@ -408,7 +407,6 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, * btrfs_delalloc_release_extents - release our outstanding_extents * @inode: the inode to balance the reservation for. * @num_bytes: the number of bytes we originally reserved with - * @qgroup_free: do we need to free qgroup meta reservation or convert them. * * When we reserve space we increase outstanding_extents for the extents we may * add. Once we've set the range as delalloc or created our ordered extents we @@ -416,8 +414,7 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, * temporarily tracked outstanding_extents. This _must_ be used in conjunction * with btrfs_delalloc_reserve_metadata. 
*/ -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free) +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned num_extents; @@ -431,7 +428,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, if (btrfs_is_testing(fs_info)) return; - btrfs_inode_rsv_release(inode, qgroup_free); + btrfs_inode_rsv_release(inode, true); } /** diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 65af7eb3f7bd..46eac7ddf0f7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2036,7 +2036,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); - btrfs_destroy_workqueue(fs_info->extent_workers); /* * Now that all other work queues are destroyed, we can safely destroy * the queues used for metadata I/O, since tasks from those other work @@ -2242,10 +2241,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); - fs_info->extent_workers = - btrfs_alloc_workqueue(fs_info, "extent-refs", flags, - min_t(u64, fs_devices->num_devices, - max_active), 8); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->submit_workers && fs_info->flush_workers && @@ -2256,7 +2251,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->extent_workers && fs_info->qgroup_rescan_workers)) { return -ENOMEM; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ef2f80825c82..d5a3a66c8f1d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8117,6 +8117,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) btrfs_err(info, "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", cache->key.objectid); + btrfs_put_block_group(cache); ret = -EINVAL; goto error; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 58a18ed11546..a8a2adaf222f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1591,7 +1591,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; - struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; @@ -1611,6 +1610,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, return -ENOMEM; while (iov_iter_count(i) > 0) { + struct extent_state *cached_state = NULL; size_t offset = offset_in_page(pos); size_t sector_offset; size_t write_bytes = min(iov_iter_count(i), @@ -1692,7 +1692,7 @@ again: force_page_uptodate); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes, true); + reserve_bytes); break; } @@ -1704,7 +1704,7 @@ again: if (extents_locked == -EAGAIN) goto again; btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes, true); + reserve_bytes); ret = extents_locked; break; } @@ -1758,11 +1758,21 @@ again: if (copied > 0) ret = btrfs_dirty_pages(inode, pages, dirty_pages, pos, copied, &cached_state); + + /* + * If we have not locked the 
extent range, because the range's + * start offset is >= i_size, we might still have a non-NULL + * cached extent state, acquired while marking the extent range + * as delalloc through btrfs_dirty_pages(). Therefore free any + * possible cached extent state to avoid a memory leak. + */ if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, - true); + else + free_extent_state(cached_state); + + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { btrfs_drop_pages(pages, num_pages); break; @@ -2056,25 +2066,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; - u64 len; - /* - * If the inode needs a full sync, make sure we use a full range to - * avoid log tree corruption, due to hole detection racing with ordered - * extent completion for adjacent ranges, and assertion failures during - * hole detection. - */ - if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - start = 0; - end = LLONG_MAX; - } - - /* - * The range length can be represented by u64, we have to do the typecasts - * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync() - */ - len = (u64)end - (u64)start + 1; trace_btrfs_sync_file(file, datasync); btrfs_init_log_ctx(&ctx, inode); @@ -2101,6 +2093,19 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* + * If the inode needs a full sync, make sure we use a full range to + * avoid log tree corruption, due to hole detection racing with ordered + * extent completion for adjacent ranges, and assertion failures during + * hole detection. Do this while holding the inode lock, to avoid races + * with other tasks. + */ + if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + start = 0; + end = LLONG_MAX; + } + + /* * Before we acquired the inode's lock, someone may have dirtied more * pages in the target range. We need to make sure that writeback for * any such pages does not start while we are logging the inode, because @@ -2127,8 +2132,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. + * + * Also, the range length can be represented by u64, we have to do the + * typecasts to avoid signed overflow if it's [0, LLONG_MAX]. 
*/ - ret = btrfs_wait_ordered_range(inode, start, len); + ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1); if (ret) { up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 2e8bb402050b..e2f49615c429 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -484,12 +484,13 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); + btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true); goto out_put; } ret = btrfs_write_out_ino_cache(root, trans, path, inode); - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); out_put: iput(inode); out_release: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d51d9466feb0..1b85278471f6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2167,7 +2167,7 @@ again: ClearPageChecked(page); set_page_dirty(page); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); @@ -4912,7 +4912,7 @@ again: if (!page) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); ret = -ENOMEM; goto out; } @@ -4980,7 +4980,7 @@ out_unlock: if (ret) btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0)); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page); out: @@ -6276,13 +6276,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, u32 sizes[2]; int nitems = name ? 
2 : 1; unsigned long ptr; + unsigned int nofs_flag; int ret; path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); + nofs_flag = memalloc_nofs_save(); inode = new_inode(fs_info->sb); + memalloc_nofs_restore(nofs_flag); if (!inode) { btrfs_free_path(path); return ERR_PTR(-ENOMEM); @@ -8682,7 +8685,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, data_reserved, offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count, false); + btrfs_delalloc_release_extents(BTRFS_I(inode), count); } out: if (wakeup) @@ -9035,7 +9038,7 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); if (!ret2) { - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; @@ -9044,7 +9047,7 @@ again: out_unlock: unlock_page(page); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0)); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); btrfs_delalloc_release_space(inode, data_reserved, page_start, reserved_space, (ret != 0)); out_noreserve: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 818f7ec8bb0e..8dad66df74ed 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1360,8 +1360,7 @@ again: unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, - false); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return i_done; out: @@ -1372,8 +1371,7 @@ out: btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, - true); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return ret; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 001efc9ba1e7..60a00f6ca18f 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3617,7 +3617,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, type, (s64)num_bytes); + trace_qgroup_meta_reserve(root, (s64)num_bytes, type); ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; @@ -3664,7 +3664,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, */ num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, type, -(s64)num_bytes); + trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, num_bytes, type); } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index e87cbdad02a3..b57f3618e58e 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -500,7 +500,7 @@ static int process_leaf(struct btrfs_root *root, struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u32 count; - int i = 0, tree_block_level = 0, ret; + int i = 0, tree_block_level = 0, ret = 0; struct btrfs_key key; int nritems = btrfs_header_nritems(leaf); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 7f219851fa23..572314aebdf1 100644 --- a/fs/btrfs/relocation.c +++ 
b/fs/btrfs/relocation.c @@ -1434,6 +1434,13 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, int clear_rsv = 0; int ret; + /* + * The subvolume has reloc tree but the swap is finished, no need to + * create/update the dead reloc tree + */ + if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) + return 0; + if (root->reloc_root) { reloc_root = root->reloc_root; reloc_root->last_trans = trans->transid; @@ -2186,7 +2193,6 @@ static int clean_dirty_subvols(struct reloc_control *rc) /* Merged subvolume, cleanup its reloc root */ struct btrfs_root *reloc_root = root->reloc_root; - clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); list_del_init(&root->reloc_dirty_list); root->reloc_root = NULL; if (reloc_root) { @@ -2195,6 +2201,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) if (ret2 < 0 && !ret) ret = ret2; } + clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); btrfs_put_fs_root(root); } else { /* Orphan reloc tree, just clean it up */ @@ -3269,6 +3276,8 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!page) { btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); ret = -ENOMEM; goto out; } @@ -3289,7 +3298,7 @@ static int relocate_file_extent_cluster(struct inode *inode, btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true); btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE, true); + PAGE_SIZE); ret = -EIO; goto out; } @@ -3318,7 +3327,7 @@ static int relocate_file_extent_cluster(struct inode *inode, btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true); btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE, true); + PAGE_SIZE); clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, @@ -3334,8 +3343,7 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); index++; - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, - false); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c3c0c064c25d..91c702b4cae9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5070,7 +5070,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *path; struct btrfs_key key; int ret; - u64 clone_src_i_size; + u64 clone_src_i_size = 0; /* * Prevent cloning from a zero offset with a length matching the sector diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1bfd7e34f31e..c6bafb8b5f42 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2932,7 +2932,8 @@ out: * in the tree of log roots */ static int update_log_root(struct btrfs_trans_handle *trans, - struct btrfs_root *log) + struct btrfs_root *log, + struct btrfs_root_item *root_item) { struct btrfs_fs_info *fs_info = log->fs_info; int ret; @@ -2940,10 +2941,10 @@ static int update_log_root(struct btrfs_trans_handle *trans, if (log->log_transid == 1) { /* insert root item on the first sync */ ret = btrfs_insert_root(trans, fs_info->log_root_tree, - &log->root_key, &log->root_item); + &log->root_key, root_item); } else { ret = btrfs_update_root(trans, fs_info->log_root_tree, - &log->root_key, &log->root_item); + &log->root_key, root_item); } return ret; } @@ -3041,6 +3042,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = fs_info->log_root_tree; + struct 
btrfs_root_item new_root_item; int log_transid = 0; struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; @@ -3104,18 +3106,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } + /* + * We _must_ update under the root->log_mutex in order to make sure we + * have a consistent view of the log root we are trying to commit at + * this moment. + * + * We _must_ copy this into a local copy, because we are not holding the + * log_root_tree->log_mutex yet. This is important because when we + * commit the log_root_tree we must have a consistent view of the + * log_root_tree when we update the super block to point at the + * log_root_tree bytenr. If we update the log_root_tree here we'll race + * with the commit and possibly point at the new block which we may not + * have written out. + */ btrfs_set_root_node(&log->root_item, log->node); + memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; /* - * Update or create log root item under the root's log_mutex to prevent - * races with concurrent log syncs that can lead to failure to update - * log root item because it was not created yet. - */ - ret = update_log_root(trans, log); - /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. new modifications of the log will be written to * new positions. so it's safe to allow log writers to go in. @@ -3135,6 +3145,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); mutex_lock(&log_root_tree->log_mutex); + + /* + * Now we are safe to update the log_root_tree because we're under the + * log_mutex, and we're a current writer so we're holding the commit + * open until we drop the log_mutex. + */ + ret = update_log_root(trans, log, &new_root_item); + if (atomic_dec_and_test(&log_root_tree->log_writers)) { /* atomic_dec_and_test implies a barrier */ cond_wake_up_nomb(&log_root_tree->log_writer_wait); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e821a0e97cd8..9c057609eaec 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3854,7 +3854,11 @@ static int alloc_profile_is_valid(u64 flags, int extended) return !extended; /* "0" is valid for usual profiles */ /* true if exactly one bit set */ - return is_power_of_2(flags); + /* + * Don't use is_power_of_2(unsigned long) because it won't work + * for the single profile (1ULL << 48) on 32-bit CPUs. + */ + return flags != 0 && (flags & (flags - 1)) == 0; } static inline int balance_need_close(struct btrfs_fs_info *fs_info) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b11af7d8e8e9..61282b77950f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -384,8 +384,8 @@ static int parse_reply_info_readdir(void **p, void *end, } done: - if (*p != end) - goto bad; + /* Skip over any unrecognized fields */ + *p = end; return 0; bad: @@ -406,12 +406,10 @@ static int parse_reply_info_filelock(void **p, void *end, goto bad; info->filelock_reply = *p; - *p += sizeof(*info->filelock_reply); - if (unlikely(*p != end)) - goto bad; + /* Skip over any unrecognized fields */ + *p = end; return 0; - bad: return -EIO; } @@ -425,18 +423,21 @@ static int parse_reply_info_create(void **p, void *end, { if (features == (u64)-1 || (features & CEPH_FEATURE_REPLY_CREATE_INODE)) { + /* Malformed reply? 
*/ if (*p == end) { info->has_create_ino = false; } else { info->has_create_ino = true; - info->ino = ceph_decode_64(p); + ceph_decode_64_safe(p, end, info->ino, bad); } + } else { + if (*p != end) + goto bad; } - if (unlikely(*p != end)) - goto bad; + /* Skip over any unrecognized fields */ + *p = end; return 0; - bad: return -EIO; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 5ef5a16c01d2..7289d443bfb3 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1379,6 +1379,11 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); struct cifsInodeInfo { bool can_cache_brlcks; struct list_head llist; /* locks helb by this inode */ + /* + * NOTE: Some code paths call down_read(lock_sem) twice, so + * we must always use use cifs_down_write() instead of down_write() + * for this semaphore to avoid deadlocks. + */ struct rw_semaphore lock_sem; /* protect the fields above */ /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 592a6cea2b79..65b07f92bc71 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -166,6 +166,7 @@ extern int cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); +extern void cifs_down_write(struct rw_semaphore *sem); extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8ee57d1f507f..8995c03056e3 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -556,9 +556,11 @@ cifs_reconnect(struct TCP_Server_Info *server) spin_lock(&GlobalMid_Lock); list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + kref_get(&mid_entry->refcount); if (mid_entry->mid_state == MID_REQUEST_SUBMITTED) mid_entry->mid_state = MID_RETRY_NEEDED; list_move(&mid_entry->qhead, &retry_list); + mid_entry->mid_flags |= MID_DELETED; } spin_unlock(&GlobalMid_Lock); mutex_unlock(&server->srv_mutex); @@ -568,6 +570,7 @@ cifs_reconnect(struct TCP_Server_Info *server) mid_entry = list_entry(tmp, struct mid_q_entry, qhead); list_del_init(&mid_entry->qhead); mid_entry->callback(mid_entry); + cifs_mid_q_entry_release(mid_entry); } if (cifs_rdma_enabled(server)) { @@ -887,8 +890,10 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) if (mid->mid_flags & MID_DELETED) printk_once(KERN_WARNING "trying to dequeue a deleted mid\n"); - else + else { list_del_init(&mid->qhead); + mid->mid_flags |= MID_DELETED; + } spin_unlock(&GlobalMid_Lock); } @@ -958,8 +963,10 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid); + kref_get(&mid_entry->refcount); mid_entry->mid_state = MID_SHUTDOWN; list_move(&mid_entry->qhead, &dispose_list); + mid_entry->mid_flags |= MID_DELETED; } spin_unlock(&GlobalMid_Lock); @@ -969,6 +976,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid); list_del_init(&mid_entry->qhead); mid_entry->callback(mid_entry); + cifs_mid_q_entry_release(mid_entry); } /* 1/8th of sec is more than enough time for them to exit */ msleep(125); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index be424e81e3ad..1cc1f298e01a 100644 --- a/fs/cifs/dir.c 
+++ b/fs/cifs/dir.c @@ -738,10 +738,16 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, static int cifs_d_revalidate(struct dentry *direntry, unsigned int flags) { + struct inode *inode; + if (flags & LOOKUP_RCU) return -ECHILD; if (d_really_is_positive(direntry)) { + inode = d_inode(direntry); + if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode))) + CIFS_I(inode)->time = 0; /* force reval */ + if (cifs_revalidate_dentry(direntry)) return 0; else { @@ -752,7 +758,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) * attributes will have been updated by * cifs_revalidate_dentry(). */ - if (IS_AUTOMOUNT(d_inode(direntry)) && + if (IS_AUTOMOUNT(inode) && !(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) { spin_lock(&direntry->d_lock); direntry->d_flags |= DCACHE_NEED_AUTOMOUNT; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 97090693d182..facb52d37d19 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -253,6 +253,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, xid, fid); + if (rc) { + server->ops->close(xid, tcon, fid); + if (rc == -ESTALE) + rc = -EOPENSTALE; + } + out: kfree(buf); return rc; @@ -275,6 +281,13 @@ cifs_has_mand_locks(struct cifsInodeInfo *cinode) return has_locks; } +void +cifs_down_write(struct rw_semaphore *sem) +{ + while (!down_write_trylock(sem)) + msleep(10); +} + struct cifsFileInfo * cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock) @@ -300,7 +313,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, INIT_LIST_HEAD(&fdlocks->locks); fdlocks->cfile = cfile; cfile->llist = fdlocks; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); list_add(&fdlocks->llist, &cinode->llist); up_write(&cinode->lock_sem); @@ -399,10 +412,11 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) bool oplock_break_cancelled; spin_lock(&tcon->open_file_lock); - + spin_lock(&cifsi->open_file_lock); spin_lock(&cifs_file->file_info_lock); if (--cifs_file->count > 0) { spin_unlock(&cifs_file->file_info_lock); + spin_unlock(&cifsi->open_file_lock); spin_unlock(&tcon->open_file_lock); return; } @@ -415,9 +429,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) cifs_add_pending_open_locked(&fid, cifs_file->tlink, &open); /* remove it from the lists */ - spin_lock(&cifsi->open_file_lock); list_del(&cifs_file->flist); - spin_unlock(&cifsi->open_file_lock); list_del(&cifs_file->tlist); atomic_dec(&tcon->num_local_opens); @@ -434,6 +446,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) cifs_set_oplock_level(cifsi, 0); } + spin_unlock(&cifsi->open_file_lock); spin_unlock(&tcon->open_file_lock); oplock_break_cancelled = wait_oplock_handler ? @@ -458,7 +471,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) * Delete any outstanding lock records. We'll lose them when the file * is closed anyway. 
*/ - down_write(&cifsi->lock_sem); + cifs_down_write(&cifsi->lock_sem); list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) { list_del(&li->llist); cifs_del_lock_waiters(li); @@ -1021,7 +1034,7 @@ static void cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock) { struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); list_add_tail(&lock->llist, &cfile->llist->locks); up_write(&cinode->lock_sem); } @@ -1043,7 +1056,7 @@ cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock, try_again: exist = false; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, lock->type, lock->flags, &conf_lock, @@ -1066,7 +1079,7 @@ try_again: (lock->blist.next == &lock->blist)); if (!rc) goto try_again; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); list_del_init(&lock->blist); } @@ -1119,7 +1132,7 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock) return rc; try_again: - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); if (!cinode->can_cache_brlcks) { up_write(&cinode->lock_sem); return rc; @@ -1325,7 +1338,7 @@ cifs_push_locks(struct cifsFileInfo *cfile) int rc = 0; /* we are going to update can_cache_brlcks here - need a write access */ - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); if (!cinode->can_cache_brlcks) { up_write(&cinode->lock_sem); return rc; @@ -1516,7 +1529,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, if (!buf) return -ENOMEM; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); for (i = 0; i < 2; i++) { cur = buf; num = 0; @@ -1847,13 +1860,12 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file = NULL; struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); /* we could simply get the first_list_entry since write-only entries are always at the end of the list but since the first entry might have a close pending, we go through the whole list */ @@ -1865,7 +1877,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, /* found a good file */ /* lock it so it will not be closed on us */ cifsFileInfo_get(open_file); - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return open_file; } /* else might as well continue, and look for another, or simply have the caller reopen it @@ -1873,7 +1885,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, } else /* write only file */ break; /* write only files are last so must be done */ } - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return NULL; } @@ -1884,7 +1896,6 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, { struct cifsFileInfo *open_file, *inv_file = NULL; struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; bool any_available = false; int rc = -EBADF; unsigned int refind = 0; @@ -1904,16 +1915,15 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, } cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); - tcon = 
cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); refind_writable: if (refind > MAX_REOPEN_ATT) { - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return rc; } list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { @@ -1925,7 +1935,7 @@ refind_writable: if (!open_file->invalidHandle) { /* found a good writable file */ cifsFileInfo_get(open_file); - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); *ret_file = open_file; return 0; } else { @@ -1945,7 +1955,7 @@ refind_writable: cifsFileInfo_get(inv_file); } - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); if (inv_file) { rc = cifs_reopen_file(inv_file, false); @@ -1960,7 +1970,7 @@ refind_writable: cifsFileInfo_put(inv_file); ++refind; inv_file = NULL; - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); goto refind_writable; } @@ -4399,17 +4409,15 @@ static int cifs_readpage(struct file *file, struct page *page) static int is_inode_writable(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *open_file; - struct cifs_tcon *tcon = - cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb)); - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return 1; } } - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return 0; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 56ca4b8ccaba..3c952024e10f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -414,6 +414,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* if uniqueid is different, return error */ if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) { + CIFS_I(*pinode)->time = 0; /* force reval */ rc = -ESTALE; goto cgiiu_exit; } @@ -421,6 +422,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* if filetype is different, return error */ if (unlikely(((*pinode)->i_mode & S_IFMT) != (fattr.cf_mode & S_IFMT))) { + CIFS_I(*pinode)->time = 0; /* force reval */ rc = -ESTALE; goto cgiiu_exit; } @@ -924,6 +926,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* if uniqueid is different, return error */ if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { + CIFS_I(*inode)->time = 0; /* force reval */ rc = -ESTALE; goto cgii_exit; } @@ -931,6 +934,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* if filetype is different, return error */ if (unlikely(((*inode)->i_mode & S_IFMT) != (fattr.cf_mode & S_IFMT))) { + CIFS_I(*inode)->time = 0; /* force reval */ rc = -ESTALE; goto cgii_exit; } @@ -2461,9 +2465,9 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) rc = tcon->ses->server->ops->flush(xid, tcon, &wfile->fid); cifsFileInfo_put(wfile); if (rc) - return rc; + goto cifs_setattr_exit; } else if (rc != -EBADF) - return rc; + goto cifs_setattr_exit; else rc = 0; } diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index ed92958e842d..657f409d4de0 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -117,10 +117,6 @@ static const 
struct smb_to_posix_error mapping_table_ERRSRV[] = { {0, 0} }; -static const struct smb_to_posix_error mapping_table_ERRHRD[] = { - {0, 0} -}; - /* * Convert a string containing text IPv4 or IPv6 address to binary form. * diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index b7421a096319..514810694c0f 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -171,6 +171,9 @@ cifs_get_next_mid(struct TCP_Server_Info *server) /* we do not want to loop forever */ last_mid = cur_mid; cur_mid++; + /* avoid 0xFFFF MID */ + if (cur_mid == 0xffff) + cur_mid++; /* * This nested loop looks more expensive than it is. diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index e6a1fc72018f..8b0b512c5792 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -145,7 +145,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, cur = buf; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) { if (flock->fl_start > li->offset || (flock->fl_start + length) < diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 5d6d44bfe10a..bb52751ba783 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -86,22 +86,8 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) static void _cifs_mid_q_entry_release(struct kref *refcount) { - struct mid_q_entry *mid = container_of(refcount, struct mid_q_entry, - refcount); - - mempool_free(mid, cifs_mid_poolp); -} - -void cifs_mid_q_entry_release(struct mid_q_entry *midEntry) -{ - spin_lock(&GlobalMid_Lock); - kref_put(&midEntry->refcount, _cifs_mid_q_entry_release); - spin_unlock(&GlobalMid_Lock); -} - -void -DeleteMidQEntry(struct mid_q_entry *midEntry) -{ + struct mid_q_entry *midEntry = + container_of(refcount, struct mid_q_entry, refcount); #ifdef CONFIG_CIFS_STATS2 __le16 command = midEntry->server->vals->lock_cmd; __u16 smb_cmd = le16_to_cpu(midEntry->command); @@ -166,6 +152,19 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) } } #endif + + mempool_free(midEntry, cifs_mid_poolp); +} + +void cifs_mid_q_entry_release(struct mid_q_entry *midEntry) +{ + spin_lock(&GlobalMid_Lock); + kref_put(&midEntry->refcount, _cifs_mid_q_entry_release); + spin_unlock(&GlobalMid_Lock); +} + +void DeleteMidQEntry(struct mid_q_entry *midEntry) +{ cifs_mid_q_entry_release(midEntry); } @@ -173,8 +172,10 @@ void cifs_delete_mid(struct mid_q_entry *mid) { spin_lock(&GlobalMid_Lock); - list_del_init(&mid->qhead); - mid->mid_flags |= MID_DELETED; + if (!(mid->mid_flags & MID_DELETED)) { + list_del_init(&mid->qhead); + mid->mid_flags |= MID_DELETED; + } spin_unlock(&GlobalMid_Lock); DeleteMidQEntry(mid); @@ -868,7 +869,10 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) rc = -EHOSTDOWN; break; default: - list_del_init(&mid->qhead); + if (!(mid->mid_flags & MID_DELETED)) { + list_del_init(&mid->qhead); + mid->mid_flags |= MID_DELETED; + } cifs_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n", __func__, mid->mid, mid->mid_state); rc = -EIO; @@ -220,10 +220,11 @@ static void *get_unlocked_entry(struct xa_state *xas, unsigned int order) for (;;) { entry = xas_find_conflict(xas); + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) + return entry; if (dax_entry_order(entry) < order) return XA_RETRY_ENTRY; - if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) || - !dax_is_locked(entry)) + if (!dax_is_locked(entry)) return entry; wq = dax_entry_waitqueue(xas, entry, &ewait.key); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 
dd0f64f7bc06..0c4b6a41e385 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1476,6 +1476,19 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, is_truncate = true; } + /* Flush dirty data/metadata before non-truncate SETATTR */ + if (is_wb && S_ISREG(inode->i_mode) && + attr->ia_valid & + (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET | + ATTR_TIMES_SET)) { + err = write_inode_now(inode, true); + if (err) + return err; + + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); + } + if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 91c99724dee0..d199dc0fbac1 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -201,7 +201,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { struct fuse_conn *fc = get_fuse_conn(inode); int err; - bool lock_inode = (file->f_flags & O_TRUNC) && + bool is_wb_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc && fc->writeback_cache; @@ -209,16 +209,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (err) return err; - if (lock_inode) + if (is_wb_truncate) { inode_lock(inode); + fuse_set_nowrite(inode); + } err = fuse_do_open(fc, get_node_id(inode), file, isdir); if (!err) fuse_finish_open(inode, file); - if (lock_inode) + if (is_wb_truncate) { + fuse_release_nowrite(inode); inode_unlock(inode); + } return err; } diff --git a/fs/io_uring.c b/fs/io_uring.c index 06d048341fa4..37da4ea68f50 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -221,6 +221,7 @@ struct io_ring_ctx { unsigned sq_entries; unsigned sq_mask; unsigned sq_thread_idle; + unsigned cached_sq_dropped; struct io_uring_sqe *sq_sqes; struct list_head defer_list; @@ -237,6 +238,7 @@ struct io_ring_ctx { /* CQ ring */ struct io_cq_ring *cq_ring; unsigned cached_cq_tail; + atomic_t cached_cq_overflow; unsigned cq_entries; unsigned cq_mask; struct wait_queue_head cq_wait; @@ -336,6 +338,8 @@ struct io_kiocb { #define REQ_F_LINK 64 /* linked sqes */ #define REQ_F_LINK_DONE 128 /* linked sqes done */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ +#define REQ_F_ISREG 2048 /* regular file */ +#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ u64 user_data; u32 result; u32 sequence; @@ -431,7 +435,8 @@ static inline bool io_sequence_defer(struct io_ring_ctx *ctx, if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) return false; - return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped; + return req->sequence != ctx->cached_cq_tail + ctx->sq_ring->dropped + + atomic_read(&ctx->cached_cq_overflow); } static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) @@ -511,9 +516,8 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, 0); } else { - unsigned overflow = READ_ONCE(ctx->cq_ring->overflow); - - WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1); + WRITE_ONCE(ctx->cq_ring->overflow, + atomic_inc_return(&ctx->cached_cq_overflow)); } } @@ -687,6 +691,14 @@ static unsigned io_cqring_events(struct io_cq_ring *ring) return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head); } +static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) +{ + struct io_sq_ring *ring = ctx->sq_ring; + + /* make sure SQ entry isn't read before tail */ + return smp_load_acquire(&ring->r.tail) - ctx->cached_sq_head; +} + /* * Find and free completed poll iocbs */ @@ -816,19 +828,11 @@ static void 
io_iopoll_reap_events(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, - long min) +static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) { - int iters, ret = 0; + int iters = 0, ret = 0; - /* - * We disallow the app entering submit/complete with polling, but we - * still need to lock the ring to prevent racing with polled issue - * that got punted to a workqueue. - */ - mutex_lock(&ctx->uring_lock); - - iters = 0; do { int tmin = 0; @@ -864,30 +868,45 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, ret = 0; } while (min && !*nr_events && !need_resched()); + return ret; +} + +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) +{ + int ret; + + /* + * We disallow the app entering submit/complete with polling, but we + * still need to lock the ring to prevent racing with polled issue + * that got punted to a workqueue. + */ + mutex_lock(&ctx->uring_lock); + ret = __io_iopoll_check(ctx, nr_events, min); mutex_unlock(&ctx->uring_lock); return ret; } -static void kiocb_end_write(struct kiocb *kiocb) +static void kiocb_end_write(struct io_kiocb *req) { - if (kiocb->ki_flags & IOCB_WRITE) { - struct inode *inode = file_inode(kiocb->ki_filp); + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (req->flags & REQ_F_ISREG) { + struct inode *inode = file_inode(req->file); - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ - if (S_ISREG(inode->i_mode)) - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - file_end_write(kiocb->ki_filp); + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); } + file_end_write(req->file); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); - kiocb_end_write(kiocb); + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; @@ -899,7 +918,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); - kiocb_end_write(kiocb); + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; @@ -1013,8 +1033,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, if (!req->file) return -EBADF; - if (force_nonblock && !io_file_supports_async(req->file)) - force_nonblock = false; + if (S_ISREG(file_inode(req->file)->i_mode)) + req->flags |= REQ_F_ISREG; + + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(req->file)) { + req->flags |= REQ_F_MUST_PUNT; + return -EAGAIN; + } kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); @@ -1035,7 +1064,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, return ret; /* don't allow async punt if RWF_NOWAIT was requested */ - if (kiocb->ki_flags & IOCB_NOWAIT) + if ((kiocb->ki_flags & IOCB_NOWAIT) || + (req->file->f_flags & O_NONBLOCK)) req->flags |= REQ_F_NOWAIT; if (force_nonblock) @@ -1048,6 +1078,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = 
io_complete_rw_iopoll; + req->result = 0; } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; @@ -1269,7 +1300,9 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, * need async punt anyway, so it's more efficient to do it * here. */ - if (force_nonblock && ret2 > 0 && ret2 < read_size) + if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && + (req->flags & REQ_F_ISREG) && + ret2 > 0 && ret2 < read_size) ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { @@ -1336,7 +1369,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, * released so that it doesn't complain about the held lock when * we return to userspace. */ - if (S_ISREG(file_inode(file)->i_mode)) { + if (req->flags & REQ_F_ISREG) { __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); __sb_writers_release(file_inode(file)->i_sb, @@ -2079,7 +2112,13 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, } ret = __io_submit_sqe(ctx, req, s, true); - if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + + /* + * We async punt it if the file wasn't marked NOWAIT, or if the file + * doesn't support non-blocking read/write attempts + */ + if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || + (req->flags & REQ_F_MUST_PUNT))) { struct io_uring_sqe *sqe_copy; sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); @@ -2150,6 +2189,8 @@ err: return; } + req->user_data = s->sqe->user_data; + /* * If we already have a head request, queue this one for async * submittal once the head completes. If we don't have a head but @@ -2255,12 +2296,13 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) /* drop invalid entries */ ctx->cached_sq_head++; - ring->dropped++; + ctx->cached_sq_dropped++; + WRITE_ONCE(ring->dropped, ctx->cached_sq_dropped); return false; } -static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, - unsigned int nr, bool has_user, bool mm_fault) +static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, + bool has_user, bool mm_fault) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; @@ -2273,6 +2315,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, } for (i = 0; i < nr; i++) { + struct sqe_submit s; + + if (!io_get_sqring(ctx, &s)) + break; + /* * If previous wasn't linked and we have a linked command, * that's the end of the chain. Submit the previous link. 
@@ -2281,16 +2328,16 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, io_queue_sqe(ctx, link, &link->submit); link = NULL; } - prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; + prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; if (unlikely(mm_fault)) { - io_cqring_add_event(ctx, sqes[i].sqe->user_data, + io_cqring_add_event(ctx, s.sqe->user_data, -EFAULT); } else { - sqes[i].has_user = has_user; - sqes[i].needs_lock = true; - sqes[i].needs_fixed_file = true; - io_submit_sqe(ctx, &sqes[i], statep, &link); + s.has_user = has_user; + s.needs_lock = true; + s.needs_fixed_file = true; + io_submit_sqe(ctx, &s, statep, &link); submitted++; } } @@ -2305,7 +2352,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, static int io_sq_thread(void *data) { - struct sqe_submit sqes[IO_IOPOLL_BATCH]; struct io_ring_ctx *ctx = data; struct mm_struct *cur_mm = NULL; mm_segment_t old_fs; @@ -2320,14 +2366,27 @@ static int io_sq_thread(void *data) timeout = inflight = 0; while (!kthread_should_park()) { - bool all_fixed, mm_fault = false; - int i; + bool mm_fault = false; + unsigned int to_submit; if (inflight) { unsigned nr_events = 0; if (ctx->flags & IORING_SETUP_IOPOLL) { - io_iopoll_check(ctx, &nr_events, 0); + /* + * inflight is the count of the maximum possible + * entries we submitted, but it can be smaller + * if we dropped some of them. If we don't have + * poll entries available, then we know that we + * have nothing left to poll for. Reset the + * inflight count to zero in that case. + */ + mutex_lock(&ctx->uring_lock); + if (!list_empty(&ctx->poll_list)) + __io_iopoll_check(ctx, &nr_events, 0); + else + inflight = 0; + mutex_unlock(&ctx->uring_lock); } else { /* * Normal IO, just pretend everything completed. @@ -2341,7 +2400,8 @@ static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } - if (!io_get_sqring(ctx, &sqes[0])) { + to_submit = io_sqring_entries(ctx); + if (!to_submit) { /* * We're polling. 
If we're within the defined idle * period, then let us spin without work before going @@ -2372,7 +2432,8 @@ static int io_sq_thread(void *data) /* make sure to read SQ tail after writing flags */ smp_mb(); - if (!io_get_sqring(ctx, &sqes[0])) { + to_submit = io_sqring_entries(ctx); + if (!to_submit) { if (kthread_should_park()) { finish_wait(&ctx->sqo_wait, &wait); break; @@ -2390,19 +2451,8 @@ static int io_sq_thread(void *data) ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; } - i = 0; - all_fixed = true; - do { - if (all_fixed && io_sqe_needs_user(sqes[i].sqe)) - all_fixed = false; - - i++; - if (i == ARRAY_SIZE(sqes)) - break; - } while (io_get_sqring(ctx, &sqes[i])); - /* Unless all new commands are FIXED regions, grab mm */ - if (!all_fixed && !cur_mm) { + if (!cur_mm) { mm_fault = !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -2410,8 +2460,9 @@ static int io_sq_thread(void *data) } } - inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL, - mm_fault); + to_submit = min(to_submit, ctx->sq_entries); + inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL, + mm_fault); /* Commit SQ ring head once we've consumed all SQEs */ io_commit_sqring(ctx); @@ -2462,13 +2513,14 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) submit++; io_submit_sqe(ctx, &s, statep, &link); } - io_commit_sqring(ctx); if (link) io_queue_sqe(ctx, link, &link->submit); if (statep) io_submit_state_end(statep); + io_commit_sqring(ctx); + return submit; } @@ -2566,7 +2618,8 @@ static void io_destruct_skb(struct sk_buff *skb) { struct io_ring_ctx *ctx = skb->sk->sk_user_data; - io_finish_async(ctx); + if (ctx->sqo_wq) + flush_workqueue(ctx->sqo_wq); unix_destruct_scm(skb); } diff --git a/fs/libfs.c b/fs/libfs.c index c9b2850c0f7c..8e023b08a240 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -89,58 +89,47 @@ int dcache_dir_close(struct inode *inode, struct file *file) EXPORT_SYMBOL(dcache_dir_close); /* parent is locked at least shared */ -static struct dentry *next_positive(struct dentry *parent, - struct list_head *from, - int count) +/* + * Returns an element of siblings' list. + * We are looking for <count>th positive after <p>; if + * found, dentry is grabbed and passed to caller via *<res>. + * If no such element exists, the anchor of list is returned + * and *<res> is set to NULL. 
+ */ +static struct list_head *scan_positives(struct dentry *cursor, + struct list_head *p, + loff_t count, + struct dentry **res) { - unsigned *seq = &parent->d_inode->i_dir_seq, n; - struct dentry *res; - struct list_head *p; - bool skipped; - int i; + struct dentry *dentry = cursor->d_parent, *found = NULL; -retry: - i = count; - skipped = false; - n = smp_load_acquire(seq) & ~1; - res = NULL; - rcu_read_lock(); - for (p = from->next; p != &parent->d_subdirs; p = p->next) { + spin_lock(&dentry->d_lock); + while ((p = p->next) != &dentry->d_subdirs) { struct dentry *d = list_entry(p, struct dentry, d_child); - if (!simple_positive(d)) { - skipped = true; - } else if (!--i) { - res = d; - break; + // we must at least skip cursors, to avoid livelocks + if (d->d_flags & DCACHE_DENTRY_CURSOR) + continue; + if (simple_positive(d) && !--count) { + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(d)) + found = dget_dlock(d); + spin_unlock(&d->d_lock); + if (likely(found)) + break; + count = 1; + } + if (need_resched()) { + list_move(&cursor->d_child, p); + p = &cursor->d_child; + spin_unlock(&dentry->d_lock); + cond_resched(); + spin_lock(&dentry->d_lock); } } - rcu_read_unlock(); - if (skipped) { - smp_rmb(); - if (unlikely(*seq != n)) - goto retry; - } - return res; -} - -static void move_cursor(struct dentry *cursor, struct list_head *after) -{ - struct dentry *parent = cursor->d_parent; - unsigned n, *seq = &parent->d_inode->i_dir_seq; - spin_lock(&parent->d_lock); - for (;;) { - n = *seq; - if (!(n & 1) && cmpxchg(seq, n, n + 1) == n) - break; - cpu_relax(); - } - __list_del(cursor->d_child.prev, cursor->d_child.next); - if (after) - list_add(&cursor->d_child, after); - else - list_add_tail(&cursor->d_child, &parent->d_subdirs); - smp_store_release(seq, n + 2); - spin_unlock(&parent->d_lock); + spin_unlock(&dentry->d_lock); + dput(*res); + *res = found; + return p; } loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) @@ -158,17 +147,28 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) return -EINVAL; } if (offset != file->f_pos) { + struct dentry *cursor = file->private_data; + struct dentry *to = NULL; + struct list_head *p; + file->f_pos = offset; - if (file->f_pos >= 2) { - struct dentry *cursor = file->private_data; - struct dentry *to; - loff_t n = file->f_pos - 2; - - inode_lock_shared(dentry->d_inode); - to = next_positive(dentry, &dentry->d_subdirs, n); - move_cursor(cursor, to ? 
&to->d_child : NULL); - inode_unlock_shared(dentry->d_inode); + inode_lock_shared(dentry->d_inode); + + if (file->f_pos > 2) { + p = scan_positives(cursor, &dentry->d_subdirs, + file->f_pos - 2, &to); + spin_lock(&dentry->d_lock); + list_move(&cursor->d_child, p); + spin_unlock(&dentry->d_lock); + } else { + spin_lock(&dentry->d_lock); + list_del_init(&cursor->d_child); + spin_unlock(&dentry->d_lock); } + + dput(to); + + inode_unlock_shared(dentry->d_inode); } return offset; } @@ -190,25 +190,29 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; - struct list_head *p = &cursor->d_child; - struct dentry *next; - bool moved = false; + struct list_head *anchor = &dentry->d_subdirs; + struct dentry *next = NULL; + struct list_head *p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) - p = &dentry->d_subdirs; - while ((next = next_positive(dentry, p, 1)) != NULL) { + p = anchor; + else + p = &cursor->d_child; + + while ((p = scan_positives(cursor, p, 1, &next)) != anchor) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, dt_type(d_inode(next)))) break; - moved = true; - p = &next->d_child; ctx->pos++; } - if (moved) - move_cursor(cursor, p); + spin_lock(&dentry->d_lock); + list_move_tail(&cursor->d_child, p); + spin_unlock(&dentry->d_lock); + dput(next); + return 0; } EXPORT_SYMBOL(dcache_readdir); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 071b90a45933..ad7a77101471 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1181,7 +1181,7 @@ bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode) if (delegation != NULL && nfs4_stateid_match_other(dst, &delegation->stateid)) { dst->seqid = delegation->stateid.seqid; - return ret; + ret = true; } rcu_read_unlock(); out: diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 222d7115db71..98a9a0bcdf38 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -123,32 +123,49 @@ static inline int put_dreq(struct nfs_direct_req *dreq) } static void -nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) +nfs_direct_handle_truncated(struct nfs_direct_req *dreq, + const struct nfs_pgio_header *hdr, + ssize_t dreq_len) { - int i; - ssize_t count; + struct nfs_direct_mirror *mirror = &dreq->mirrors[hdr->pgio_mirror_idx]; + + if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) || + test_bit(NFS_IOHDR_EOF, &hdr->flags))) + return; + if (dreq->max_count >= dreq_len) { + dreq->max_count = dreq_len; + if (dreq->count > dreq_len) + dreq->count = dreq_len; + + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) + dreq->error = hdr->error; + else /* Clear outstanding error if this is EOF */ + dreq->error = 0; + } + if (mirror->count > dreq_len) + mirror->count = dreq_len; +} - WARN_ON_ONCE(dreq->count >= dreq->max_count); +static void +nfs_direct_count_bytes(struct nfs_direct_req *dreq, + const struct nfs_pgio_header *hdr) +{ + struct nfs_direct_mirror *mirror = &dreq->mirrors[hdr->pgio_mirror_idx]; + loff_t hdr_end = hdr->io_start + hdr->good_bytes; + ssize_t dreq_len = 0; - if (dreq->mirror_count == 1) { - dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; - dreq->count += hdr->good_bytes; - } else { - /* mirrored writes */ - count = dreq->mirrors[hdr->pgio_mirror_idx].count; - if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { - count = hdr->io_start + hdr->good_bytes - dreq->io_start; - dreq->mirrors[hdr->pgio_mirror_idx].count = count; - } - 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 222d7115db71..98a9a0bcdf38 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -123,32 +123,49 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
 }
 
 static void
-nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
+nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
+			    const struct nfs_pgio_header *hdr,
+			    ssize_t dreq_len)
 {
-	int i;
-	ssize_t count;
+	struct nfs_direct_mirror *mirror = &dreq->mirrors[hdr->pgio_mirror_idx];
+
+	if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
+	      test_bit(NFS_IOHDR_EOF, &hdr->flags)))
+		return;
+	if (dreq->max_count >= dreq_len) {
+		dreq->max_count = dreq_len;
+		if (dreq->count > dreq_len)
+			dreq->count = dreq_len;
+
+		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
+			dreq->error = hdr->error;
+		else /* Clear outstanding error if this is EOF */
+			dreq->error = 0;
+	}
+	if (mirror->count > dreq_len)
+		mirror->count = dreq_len;
+}
 
-	WARN_ON_ONCE(dreq->count >= dreq->max_count);
+static void
+nfs_direct_count_bytes(struct nfs_direct_req *dreq,
+		       const struct nfs_pgio_header *hdr)
+{
+	struct nfs_direct_mirror *mirror = &dreq->mirrors[hdr->pgio_mirror_idx];
+	loff_t hdr_end = hdr->io_start + hdr->good_bytes;
+	ssize_t dreq_len = 0;
 
-	if (dreq->mirror_count == 1) {
-		dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes;
-		dreq->count += hdr->good_bytes;
-	} else {
-		/* mirrored writes */
-		count = dreq->mirrors[hdr->pgio_mirror_idx].count;
-		if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
-			count = hdr->io_start + hdr->good_bytes - dreq->io_start;
-			dreq->mirrors[hdr->pgio_mirror_idx].count = count;
-		}
-
-		/* update the dreq->count by finding the minimum agreed count from all
-		 * mirrors */
-		count = dreq->mirrors[0].count;
+	if (hdr_end > dreq->io_start)
+		dreq_len = hdr_end - dreq->io_start;
 
-		for (i = 1; i < dreq->mirror_count; i++)
-			count = min(count, dreq->mirrors[i].count);
+	nfs_direct_handle_truncated(dreq, hdr, dreq_len);
 
-		dreq->count = count;
-	}
+	if (dreq_len > dreq->max_count)
+		dreq_len = dreq->max_count;
+
+	if (mirror->count < dreq_len)
+		mirror->count = dreq_len;
+	if (dreq->count < dreq_len)
+		dreq->count = dreq_len;
 }
 
 /*
@@ -402,20 +419,12 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 	struct nfs_direct_req *dreq = hdr->dreq;
 
 	spin_lock(&dreq->lock);
-	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
-		dreq->error = hdr->error;
-
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 		spin_unlock(&dreq->lock);
 		goto out_put;
 	}
 
-	if (hdr->good_bytes != 0)
-		nfs_direct_good_bytes(dreq, hdr);
-
-	if (test_bit(NFS_IOHDR_EOF, &hdr->flags))
-		dreq->error = 0;
-
+	nfs_direct_count_bytes(dreq, hdr);
 	spin_unlock(&dreq->lock);
 
 	while (!list_empty(&hdr->pages)) {
@@ -652,6 +661,9 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
 
 	dreq->count = 0;
+	dreq->max_count = 0;
+	list_for_each_entry(req, &reqs, wb_list)
+		dreq->max_count += req->wb_bytes;
 	dreq->verf.committed = NFS_INVALID_STABLE_HOW;
 	nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
 	for (i = 0; i < dreq->mirror_count; i++)
@@ -791,17 +803,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
 
 	spin_lock(&dreq->lock);
-
-	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
-		dreq->error = hdr->error;
-
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 		spin_unlock(&dreq->lock);
 		goto out_put;
 	}
 
+	nfs_direct_count_bytes(dreq, hdr);
 	if (hdr->good_bytes != 0) {
-		nfs_direct_good_bytes(dreq, hdr);
 		if (nfs_write_need_commit(hdr)) {
 			if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
 				request_commit = true;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1406858bae6c..e1e7d2724b97 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6058,6 +6058,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 	}
 	status = task->tk_status;
 	if (setclientid.sc_cred) {
+		kfree(clp->cl_acceptor);
 		clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
 		put_rpccred(setclientid.sc_cred);
 	}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 85ca49549b39..52cab65f91cf 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -786,7 +786,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs_page *head;
 
-	atomic_long_dec(&nfsi->nrequests);
 	if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
 		head = req->wb_head;
 
@@ -799,8 +798,10 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 		spin_unlock(&mapping->private_lock);
 	}
 
-	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
+	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
 		nfs_release_request(req);
+		atomic_long_dec(&nfsi->nrequests);
+	}
 }
 
 static void
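The new nfs_direct_count_bytes() above derives how much of the request completed from the header's end offset, clamps it to dreq->max_count, and only ever lets the per-mirror and overall counts grow toward that clamp; nfs_direct_handle_truncated() additionally shrinks max_count and records (or clears, on EOF) the error when the I/O comes back short. A small self-contained sketch of just that clamping arithmetic, using hypothetical stand-in types rather than the kernel structures:

#include <stdio.h>
#include <sys/types.h>

/* Hypothetical stand-ins for the dreq/hdr fields used in the patch. */
struct sketch_req { off_t io_start; ssize_t count, max_count; };
struct sketch_hdr { off_t io_start; ssize_t good_bytes; };

static void count_bytes(struct sketch_req *req, const struct sketch_hdr *hdr)
{
	off_t hdr_end = hdr->io_start + hdr->good_bytes;
	ssize_t len = 0;

	if (hdr_end > req->io_start)	/* bytes that landed past the request start */
		len = hdr_end - req->io_start;
	if (len > req->max_count)	/* never report more than the request allows */
		len = req->max_count;
	if (req->count < len)		/* the count only ever grows toward the clamp */
		req->count = len;
}

int main(void)
{
	struct sketch_req req = { .io_start = 0, .count = 0, .max_count = 4096 };
	struct sketch_hdr hdr = { .io_start = 0, .good_bytes = 8192 };

	count_bytes(&req, &hdr);
	printf("reported count: %zd\n", req.count);	/* 4096, clamped to max_count */
	return 0;
}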
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a4c905d6b575..9b827143a350 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2042,7 +2042,8 @@ out_write_size:
 		inode->i_mtime = inode->i_ctime = current_time(inode);
 		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
 		di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-		ocfs2_update_inode_fsync_trans(handle, inode, 1);
+		if (handle)
+			ocfs2_update_inode_fsync_trans(handle, inode, 1);
 	}
 	if (handle)
 		ocfs2_journal_dirty(handle, wc->w_di_bh);
@@ -2139,13 +2140,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
 	struct ocfs2_dio_write_ctxt *dwc = NULL;
 	struct buffer_head *di_bh = NULL;
 	u64 p_blkno;
-	loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+	unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+	loff_t pos = iblock << i_blkbits;
+	sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
 	unsigned len, total_len = bh_result->b_size;
 	int ret = 0, first_get_block = 0;
 
 	len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
 	len = min(total_len, len);
 
+	/*
+	 * bh_result->b_size is count in get_more_blocks according to write
+	 * "pos" and "end", we need map twice to return different buffer state:
+	 * 1. area in file size, not set NEW;
+	 * 2. area out file size, set NEW.
+	 *
+	 * iblock    endblk
+	 * |--------|---------|---------|---------
+	 * |<-------area in file------->|
+	 */
+
+	if ((iblock <= endblk) &&
+	    ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+		len = (endblk - iblock + 1) << i_blkbits;
+
 	mlog(0, "get block of %lu at %llu:%u req %u\n",
 			inode->i_ino, pos, len, total_len);
@@ -2229,6 +2247,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
 	if (desc->c_needs_zero)
 		set_buffer_new(bh_result);
 
+	if (iblock > endblk)
+		set_buffer_new(bh_result);
+
 	/* May sleep in end_io. It should not happen in a irq context. So defer
 	 * it to dio work queue. */
 	set_buffer_defer_completion(bh_result);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index d6f7b299eb23..efeea208fdeb 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -283,7 +283,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
 	if (inode_alloc)
 		inode_lock(inode_alloc);
 
-	if (o2info_coherent(&fi->ifi_req)) {
+	if (inode_alloc && o2info_coherent(&fi->ifi_req)) {
 		status = ocfs2_inode_lock(inode_alloc, &bh, 0);
 		if (status < 0) {
 			mlog_errno(status);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 930e3d388579..699a560efbb0 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -217,7 +217,8 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb)
 	/* At this point, we know that no more recovery threads can be
 	 * launched, so wait for any recovery completion work to
 	 * complete. */
-	flush_workqueue(osb->ocfs2_wq);
+	if (osb->ocfs2_wq)
+		flush_workqueue(osb->ocfs2_wq);
 
 	/*
 	 * Now that recovery is shut down, and the osb is about to be
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 158e5af767fd..720e9f94957e 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -377,7 +377,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	struct ocfs2_dinode *alloc = NULL;
 
 	cancel_delayed_work(&osb->la_enable_wq);
-	flush_workqueue(osb->ocfs2_wq);
+	if (osb->ocfs2_wq)
+		flush_workqueue(osb->ocfs2_wq);
 
 	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
 		goto out;
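The ocfs2_dio_wr_get_block() change above splits a direct-I/O mapping at the last block that lies inside i_size ("endblk"): a request that starts inside the file but would run past EOF is cut short, and a later call for the blocks beyond EOF gets buffer_new set. A compilable sketch of that length clamp, with illustrative names rather than the ocfs2 code itself:

#include <stdio.h>
#include <stdint.h>

/*
 * If a requested mapping starts at or before the last in-file block but
 * would extend past it, cut it off at that block so the in-file and
 * past-EOF ranges are mapped by separate calls.
 */
static uint64_t clamp_map_len(uint64_t iblock, uint64_t len_bytes,
			      uint64_t i_size, unsigned blkbits)
{
	uint64_t endblk = (i_size - 1) >> blkbits;	/* last block inside the file */

	if (iblock <= endblk &&
	    iblock + ((len_bytes - 1) >> blkbits) > endblk)
		len_bytes = (endblk - iblock + 1) << blkbits;
	return len_bytes;
}

int main(void)
{
	/* 4 KiB blocks, 10 KiB file: a 16 KiB request at block 0 is cut to 12 KiB. */
	printf("%llu\n", (unsigned long long)clamp_map_len(0, 16384, 10240, 12));
	return 0;
}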
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 90c830e3758e..d8507972ee13 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1490,18 +1490,6 @@ static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc,
 	return loc->xl_ops->xlo_check_space(loc, xi);
 }
 
-static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash)
-{
-	loc->xl_ops->xlo_add_entry(loc, name_hash);
-	loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
-	/*
-	 * We can't leave the new entry's xe_name_offset at zero or
-	 * add_namevalue() will go nuts.  We set it to the size of our
-	 * storage so that it can never be less than any other entry.
-	 */
-	loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
-}
-
 static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc,
 				   struct ocfs2_xattr_info *xi)
 {
@@ -2133,29 +2121,31 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc,
 	if (rc)
 		goto out;
 
-	if (loc->xl_entry) {
-		if (ocfs2_xa_can_reuse_entry(loc, xi)) {
-			orig_value_size = loc->xl_entry->xe_value_size;
-			rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
-			if (rc)
-				goto out;
-			goto alloc_value;
-		}
+	if (!loc->xl_entry) {
+		rc = -EINVAL;
+		goto out;
+	}
 
-		if (!ocfs2_xattr_is_local(loc->xl_entry)) {
-			orig_clusters = ocfs2_xa_value_clusters(loc);
-			rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
-			if (rc) {
-				mlog_errno(rc);
-				ocfs2_xa_cleanup_value_truncate(loc,
-								"overwriting",
-								orig_clusters);
-				goto out;
-			}
+	if (ocfs2_xa_can_reuse_entry(loc, xi)) {
+		orig_value_size = loc->xl_entry->xe_value_size;
+		rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
+		if (rc)
+			goto out;
+		goto alloc_value;
+	}
+
+	if (!ocfs2_xattr_is_local(loc->xl_entry)) {
+		orig_clusters = ocfs2_xa_value_clusters(loc);
+		rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
+		if (rc) {
+			mlog_errno(rc);
+			ocfs2_xa_cleanup_value_truncate(loc,
+							"overwriting",
+							orig_clusters);
+			goto out;
 		}
-		ocfs2_xa_wipe_namevalue(loc);
-	} else
-		ocfs2_xa_add_entry(loc, name_hash);
+	}
+	ocfs2_xa_wipe_namevalue(loc);
 
 	/*
 	 * If we get here, we have a blank entry.  Fill it.  We grow our
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 544d1ee15aee..7c952ee732e6 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -42,10 +42,12 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
 		return -EINVAL;
 
 	while (count > 0) {
-		if (pfn_valid(pfn))
-			ppage = pfn_to_page(pfn);
-		else
-			ppage = NULL;
+		/*
+		 * TODO: ZONE_DEVICE support requires to identify
+		 * memmaps that were actually initialized.
+		 */
+		ppage = pfn_to_online_page(pfn);
+
 		if (!ppage || PageSlab(ppage) || page_has_type(ppage))
 			pcount = 0;
 		else
@@ -216,10 +218,11 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
 		return -EINVAL;
 
 	while (count > 0) {
-		if (pfn_valid(pfn))
-			ppage = pfn_to_page(pfn);
-		else
-			ppage = NULL;
+		/*
+		 * TODO: ZONE_DEVICE support requires to identify
+		 * memmaps that were actually initialized.
+		 */
+		ppage = pfn_to_online_page(pfn);
 
 		if (put_user(stable_page_flags(ppage), out)) {
 			ret = -EFAULT;
@@ -261,10 +264,11 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
 		return -EINVAL;
 
 	while (count > 0) {
-		if (pfn_valid(pfn))
-			ppage = pfn_to_page(pfn);
-		else
-			ppage = NULL;
+		/*
+		 * TODO: ZONE_DEVICE support requires to identify
+		 * memmaps that were actually initialized.
+		 */
+		ppage = pfn_to_online_page(pfn);
 
 		if (ppage)
 			ino = page_cgroup_ino(ppage);
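The fs/proc/page.c hunks replace the pfn_valid()/pfn_to_page() pair with pfn_to_online_page(), so PFNs whose memmap was never initialized (for example offline or ZONE_DEVICE ranges) are treated as absent instead of being dereferenced. The user-visible interface is unchanged: /proc/kpagecount is still one 64-bit mapcount per PFN, readable by root. A minimal reader, shown only to illustrate that interface:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

/* Read the mapcount that /proc/kpagecount reports for one page frame number. */
static int read_kpagecount(uint64_t pfn, uint64_t *count)
{
	int fd = open("/proc/kpagecount", O_RDONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = pread(fd, count, sizeof(*count), pfn * sizeof(*count));
	close(fd);
	return n == (ssize_t)sizeof(*count) ? 0 : -1;
}

int main(void)
{
	uint64_t count;

	if (read_kpagecount(4096, &count) == 0)	/* needs root; pfn 4096 is arbitrary */
		printf("pfn 4096 mapcount: %llu\n", (unsigned long long)count);
	return 0;
}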
diff --git a/fs/readdir.c b/fs/readdir.c
index 2f6a4534e0df..d26d5ea4de7b 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -20,9 +20,23 @@
 #include <linux/syscalls.h>
 #include <linux/unistd.h>
 #include <linux/compat.h>
-
 #include <linux/uaccess.h>
 
+#include <asm/unaligned.h>
+
+/*
+ * Note the "unsafe_put_user() semantics: we goto a
+ * label for errors.
+ */
+#define unsafe_copy_dirent_name(_dst, _src, _len, label) do {	\
+	char __user *dst = (_dst);				\
+	const char *src = (_src);				\
+	size_t len = (_len);					\
+	unsafe_put_user(0, dst+len, label);			\
+	unsafe_copy_to_user(dst, src, len, label);		\
+} while (0)
+
+
 int iterate_dir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
@@ -65,6 +79,40 @@ out:
 EXPORT_SYMBOL(iterate_dir);
 
 /*
+ * POSIX says that a dirent name cannot contain NULL or a '/'.
+ *
+ * It's not 100% clear what we should really do in this case.
+ * The filesystem is clearly corrupted, but returning a hard
+ * error means that you now don't see any of the other names
+ * either, so that isn't a perfect alternative.
+ *
+ * And if you return an error, what error do you use? Several
+ * filesystems seem to have decided on EUCLEAN being the error
+ * code for EFSCORRUPTED, and that may be the error to use. Or
+ * just EIO, which is perhaps more obvious to users.
+ *
+ * In order to see the other file names in the directory, the
+ * caller might want to make this a "soft" error: skip the
+ * entry, and return the error at the end instead.
+ *
+ * Note that this should likely do a "memchr(name, 0, len)"
+ * check too, since that would be filesystem corruption as
+ * well.  However, that case can't actually confuse user space,
+ * which has to do a strlen() on the name anyway to find the
+ * filename length, and the above "soft error" worry means
+ * that it's probably better left alone until we have that
+ * issue clarified.
+ */
+static int verify_dirent_name(const char *name, int len)
+{
+	if (!len)
+		return -EIO;
+	if (memchr(name, '/', len))
+		return -EIO;
+	return 0;
+}
+
+/*
  * Traditional linux readdir() handling..
  *
  * "count=1" is a special case, meaning that the buffer is one
@@ -173,6 +221,9 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
 	int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
 		sizeof(long));
 
+	buf->error = verify_dirent_name(name, namlen);
+	if (unlikely(buf->error))
+		return buf->error;
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
 		return -EINVAL;
@@ -182,28 +233,31 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
 		return -EOVERFLOW;
 	}
 	dirent = buf->previous;
-	if (dirent) {
-		if (signal_pending(current))
-			return -EINTR;
-		if (__put_user(offset, &dirent->d_off))
-			goto efault;
-	}
-	dirent = buf->current_dir;
-	if (__put_user(d_ino, &dirent->d_ino))
-		goto efault;
-	if (__put_user(reclen, &dirent->d_reclen))
-		goto efault;
-	if (copy_to_user(dirent->d_name, name, namlen))
-		goto efault;
-	if (__put_user(0, dirent->d_name + namlen))
-		goto efault;
-	if (__put_user(d_type, (char __user *) dirent + reclen - 1))
+	if (dirent && signal_pending(current))
+		return -EINTR;
+
+	/*
+	 * Note! This range-checks 'previous' (which may be NULL).
+	 * The real range was checked in getdents
+	 */
+	if (!user_access_begin(dirent, sizeof(*dirent)))
 		goto efault;
+	if (dirent)
+		unsafe_put_user(offset, &dirent->d_off, efault_end);
+	dirent = buf->current_dir;
+	unsafe_put_user(d_ino, &dirent->d_ino, efault_end);
+	unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
+	unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end);
+	unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
+	user_access_end();
+
 	buf->previous = dirent;
 	dirent = (void __user *)dirent + reclen;
 	buf->current_dir = dirent;
 	buf->count -= reclen;
 	return 0;
+efault_end:
+	user_access_end();
 efault:
 	buf->error = -EFAULT;
 	return -EFAULT;
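verify_dirent_name(), added above, turns a corrupted on-disk name (empty, or containing a '/') into -EIO before filldir copies anything to user space; as its comment notes, embedded NUL bytes are deliberately left alone for now. A plain userspace restatement of the check, not the kernel function itself:

#include <errno.h>
#include <stdio.h>
#include <string.h>

static int check_dirent_name(const char *name, size_t len)
{
	if (len == 0)
		return -EIO;	/* empty names never come from a sane filesystem */
	if (memchr(name, '/', len))
		return -EIO;	/* '/' cannot appear inside a single path component */
	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       check_dirent_name("ok.txt", 6),
	       check_dirent_name("", 0),
	       check_dirent_name("bad/name", 8));	/* 0 -5 -5 on Linux (EIO == 5) */
	return 0;
}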
@@ -259,34 +313,38 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
 	int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
 		sizeof(u64));
 
+	buf->error = verify_dirent_name(name, namlen);
+	if (unlikely(buf->error))
+		return buf->error;
 	buf->error = -EINVAL;	/* only used if we fail.. */
 	if (reclen > buf->count)
 		return -EINVAL;
 	dirent = buf->previous;
-	if (dirent) {
-		if (signal_pending(current))
-			return -EINTR;
-		if (__put_user(offset, &dirent->d_off))
-			goto efault;
-	}
-	dirent = buf->current_dir;
-	if (__put_user(ino, &dirent->d_ino))
-		goto efault;
-	if (__put_user(0, &dirent->d_off))
-		goto efault;
-	if (__put_user(reclen, &dirent->d_reclen))
-		goto efault;
-	if (__put_user(d_type, &dirent->d_type))
-		goto efault;
-	if (copy_to_user(dirent->d_name, name, namlen))
-		goto efault;
-	if (__put_user(0, dirent->d_name + namlen))
+	if (dirent && signal_pending(current))
+		return -EINTR;
+
+	/*
+	 * Note! This range-checks 'previous' (which may be NULL).
+	 * The real range was checked in getdents
+	 */
+	if (!user_access_begin(dirent, sizeof(*dirent)))
 		goto efault;
+	if (dirent)
+		unsafe_put_user(offset, &dirent->d_off, efault_end);
+	dirent = buf->current_dir;
+	unsafe_put_user(ino, &dirent->d_ino, efault_end);
+	unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
+	unsafe_put_user(d_type, &dirent->d_type, efault_end);
+	unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
+	user_access_end();
+
 	buf->previous = dirent;
 	dirent = (void __user *)dirent + reclen;
 	buf->current_dir = dirent;
 	buf->count -= reclen;
 	return 0;
+efault_end:
+	user_access_end();
 efault:
 	buf->error = -EFAULT;
 	return -EFAULT;
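For context, the fields that filldir64() now fills under user_access_begin() (d_ino, d_off, d_reclen, d_type and the NUL-terminated d_name written by unsafe_copy_dirent_name) are exactly what a getdents64() caller walks. A minimal consumer using the raw syscall; the struct below mirrors the ABI record layout and is shown only as an illustration, not as part of the patch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Mirrors the getdents64 ABI record that filldir64() writes. */
struct linux_dirent64 {
	uint64_t       d_ino;
	int64_t        d_off;
	unsigned short d_reclen;
	unsigned char  d_type;
	char           d_name[];
};

int main(void)
{
	char buf[4096];
	int fd = open(".", O_RDONLY | O_DIRECTORY);
	long n;

	if (fd < 0)
		return 1;
	while ((n = syscall(SYS_getdents64, fd, buf, sizeof(buf))) > 0) {
		/* Walk the records exactly as the kernel laid them out. */
		for (long off = 0; off < n; ) {
			struct linux_dirent64 *d = (struct linux_dirent64 *)(buf + off);

			printf("%s\n", d->d_name);	/* d_name is NUL-terminated by the kernel */
			off += d->d_reclen;
		}
	}
	close(fd);
	return 0;
}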