diff options
Diffstat (limited to 'fs')
144 files changed, 2004 insertions, 860 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index 02a9237807a7..85ae4953f2d7 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -365,6 +365,7 @@ config GRACE_PERIOD config LOCKD tristate depends on FILE_LOCKING + select CRC32 select GRACE_PERIOD config LOCKD_V4 diff --git a/fs/affs/file.c b/fs/affs/file.c index 04c018e19602..93b319917c9a 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -597,7 +597,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) BUG_ON(tmp > bsize); AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); - AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1); AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp); affs_fix_checksum(sb, bh); bh->b_state &= ~(1UL << BH_New); @@ -726,7 +726,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, tmp = min(bsize - boff, to - from); BUG_ON(boff + tmp > bsize || tmp > bsize); memcpy(AFFS_DATA(bh) + boff, data + from, tmp); - be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp); + AFFS_DATA_HEAD(bh)->size = cpu_to_be32( + max(boff + tmp, be32_to_cpu(AFFS_DATA_HEAD(bh)->size))); affs_fix_checksum(sb, bh); mark_buffer_dirty_inode(bh, inode); written += tmp; @@ -748,7 +749,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, if (buffer_new(bh)) { AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); - AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1); AFFS_DATA_HEAD(bh)->size = cpu_to_be32(bsize); AFFS_DATA_HEAD(bh)->next = 0; bh->b_state &= ~(1UL << BH_New); @@ -782,7 +783,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, if (buffer_new(bh)) { AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); - AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); + AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1); AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp); AFFS_DATA_HEAD(bh)->next = 0; bh->b_state &= ~(1UL << BH_New); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fb2c8d14327a..3ff7d2e47c7e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -110,25 +110,6 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) -static int set_brk(unsigned long start, unsigned long end, int prot) -{ - start = ELF_PAGEALIGN(start); - end = ELF_PAGEALIGN(end); - if (end > start) { - /* - * Map the last of the bss segment. - * If the header is requesting these pages to be - * executable, honour that (ppc32 needs this). - */ - int error = vm_brk_flags(start, end - start, - prot & PROT_EXEC ? VM_EXEC : 0); - if (error) - return error; - } - current->mm->start_brk = current->mm->brk = end; - return 0; -} - /* We need to explicitly zero any fractional pages after the data section (i.e. bss). This would contain the junk from the file that should not @@ -406,6 +387,51 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } +static unsigned long elf_load(struct file *filep, unsigned long addr, + const struct elf_phdr *eppnt, int prot, int type, + unsigned long total_size) +{ + unsigned long zero_start, zero_end; + unsigned long map_addr; + + if (eppnt->p_filesz) { + map_addr = elf_map(filep, addr, eppnt, prot, type, total_size); + if (BAD_ADDR(map_addr)) + return map_addr; + if (eppnt->p_memsz > eppnt->p_filesz) { + zero_start = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) + + eppnt->p_filesz; + zero_end = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) + + eppnt->p_memsz; + + /* Zero the end of the last mapped page */ + padzero(zero_start); + } + } else { + map_addr = zero_start = ELF_PAGESTART(addr); + zero_end = zero_start + ELF_PAGEOFFSET(eppnt->p_vaddr) + + eppnt->p_memsz; + } + if (eppnt->p_memsz > eppnt->p_filesz) { + /* + * Map the last of the segment. + * If the header is requesting these pages to be + * executable, honour that (ppc32 needs this). + */ + int error; + + zero_start = ELF_PAGEALIGN(zero_start); + zero_end = ELF_PAGEALIGN(zero_end); + + error = vm_brk_flags(zero_start, zero_end - zero_start, + prot & PROT_EXEC ? VM_EXEC : 0); + if (error) + map_addr = error; + } + return map_addr; +} + + static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { elf_addr_t min_addr = -1; @@ -828,8 +854,8 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; - unsigned long elf_bss, elf_brk; - int bss_prot = 0; + unsigned long elf_brk; + bool brk_moved = false; int retval, i; unsigned long elf_entry; unsigned long e_entry; @@ -1021,7 +1047,6 @@ out_free_interp: if (retval < 0) goto out_free_dentry; - elf_bss = 0; elf_brk = 0; start_code = ~0UL; @@ -1041,33 +1066,6 @@ out_free_interp: if (elf_ppnt->p_type != PT_LOAD) continue; - if (unlikely (elf_brk > elf_bss)) { - unsigned long nbyte; - - /* There was a PT_LOAD segment with p_memsz > p_filesz - before this one. Map anonymous pages, if needed, - and clear the area. */ - retval = set_brk(elf_bss + load_bias, - elf_brk + load_bias, - bss_prot); - if (retval) - goto out_free_dentry; - nbyte = ELF_PAGEOFFSET(elf_bss); - if (nbyte) { - nbyte = ELF_MIN_ALIGN - nbyte; - if (nbyte > elf_brk - elf_bss) - nbyte = elf_brk - elf_bss; - if (clear_user((void __user *)elf_bss + - load_bias, nbyte)) { - /* - * This bss-zeroing can fail if the ELF - * file specifies odd protections. So - * we don't check the return value - */ - } - } - } - elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, !!interpreter, false); @@ -1095,15 +1093,49 @@ out_free_interp: * Header for ET_DYN binaries to calculate the * randomization (load_bias) for all the LOAD * Program Headers. + */ + + /* + * Calculate the entire size of the ELF mapping + * (total_size), used for the initial mapping, + * due to load_addr_set which is set to true later + * once the initial mapping is performed. + * + * Note that this is only sensible when the LOAD + * segments are contiguous (or overlapping). If + * used for LOADs that are far apart, this would + * cause the holes between LOADs to be mapped, + * running the risk of having the mapping fail, + * as it would be larger than the ELF file itself. + * + * As a result, only ET_DYN does this, since + * some ET_EXEC (e.g. ia64) may have large virtual + * memory holes between LOADs. + * + */ + total_size = total_mapping_size(elf_phdata, + elf_ex->e_phnum); + if (!total_size) { + retval = -EINVAL; + goto out_free_dentry; + } + + /* Calculate any requested alignment. */ + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); + + /** + * DOC: PIE handling * - * There are effectively two types of ET_DYN - * binaries: programs (i.e. PIE: ET_DYN with INTERP) - * and loaders (ET_DYN without INTERP, since they - * _are_ the ELF interpreter). The loaders must - * be loaded away from programs since the program - * may otherwise collide with the loader (especially - * for ET_EXEC which does not have a randomized - * position). For example to handle invocations of + * There are effectively two types of ET_DYN ELF + * binaries: programs (i.e. PIE: ET_DYN with + * PT_INTERP) and loaders (i.e. static PIE: ET_DYN + * without PT_INTERP, usually the ELF interpreter + * itself). Loaders must be loaded away from programs + * since the program may otherwise collide with the + * loader (especially for ET_EXEC which does not have + * a randomized position). + * + * For example, to handle invocations of * "./ld.so someprog" to test out a new version of * the loader, the subsequent program that the * loader loads must avoid the loader itself, so @@ -1116,17 +1148,49 @@ out_free_interp: * ELF_ET_DYN_BASE and loaders are loaded into the * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). + * + * See below for "brk" handling details, which is + * also affected by program vs loader and ASLR. */ if (interpreter) { + /* On ET_DYN with PT_INTERP, we do the ASLR. */ load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); + /* Adjust alignment as requested. */ if (alignment) load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED_NOREPLACE; - } else - load_bias = 0; + } else { + /* + * For ET_DYN without PT_INTERP, we rely on + * the architectures's (potentially ASLR) mmap + * base address (via a load_bias of 0). + * + * When a large alignment is requested, we + * must do the allocation at address "0" right + * now to discover where things will load so + * that we can adjust the resulting alignment. + * In this case (load_bias != 0), we can use + * MAP_FIXED_NOREPLACE to make sure the mapping + * doesn't collide with anything. + */ + if (alignment > ELF_MIN_ALIGN) { + load_bias = elf_load(bprm->file, 0, elf_ppnt, + elf_prot, elf_flags, total_size); + if (BAD_ADDR(load_bias)) { + retval = IS_ERR_VALUE(load_bias) ? + PTR_ERR((void*)load_bias) : -EINVAL; + goto out_free_dentry; + } + vm_munmap(load_bias, total_size); + /* Adjust alignment as requested. */ + if (alignment) + load_bias &= ~(alignment - 1); + elf_flags |= MAP_FIXED_NOREPLACE; + } else + load_bias = 0; + } /* * Since load_bias is used for all subsequent loading @@ -1136,34 +1200,9 @@ out_free_interp: * is then page aligned. */ load_bias = ELF_PAGESTART(load_bias - vaddr); - - /* - * Calculate the entire size of the ELF mapping - * (total_size), used for the initial mapping, - * due to load_addr_set which is set to true later - * once the initial mapping is performed. - * - * Note that this is only sensible when the LOAD - * segments are contiguous (or overlapping). If - * used for LOADs that are far apart, this would - * cause the holes between LOADs to be mapped, - * running the risk of having the mapping fail, - * as it would be larger than the ELF file itself. - * - * As a result, only ET_DYN does this, since - * some ET_EXEC (e.g. ia64) may have large virtual - * memory holes between LOADs. - * - */ - total_size = total_mapping_size(elf_phdata, - elf_ex->e_phnum); - if (!total_size) { - retval = -EINVAL; - goto out_free_dentry; - } } - error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, + error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { retval = IS_ERR_VALUE(error) ? @@ -1211,41 +1250,23 @@ out_free_interp: k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz; - if (k > elf_bss) - elf_bss = k; if ((elf_ppnt->p_flags & PF_X) && end_code < k) end_code = k; if (end_data < k) end_data = k; k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; - if (k > elf_brk) { - bss_prot = elf_prot; + if (k > elf_brk) elf_brk = k; - } } e_entry = elf_ex->e_entry + load_bias; phdr_addr += load_bias; - elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; end_code += load_bias; start_data += load_bias; end_data += load_bias; - /* Calling set_brk effectively mmaps the pages that we need - * for the bss and break sections. We must do this before - * mapping in the interpreter, to make sure it doesn't wind - * up getting placed where the bss needs to go. - */ - retval = set_brk(elf_bss, elf_brk, bss_prot); - if (retval) - goto out_free_dentry; - if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { - retval = -EFAULT; /* Nobody gets to see this, but.. */ - goto out_free_dentry; - } - if (interpreter) { elf_entry = load_elf_interp(interp_elf_ex, interpreter, @@ -1301,24 +1322,44 @@ out_free_interp: mm->end_data = end_data; mm->start_stack = bprm->p; - if ((current->flags & PF_RANDOMIZE) && (snapshot_randomize_va_space > 1)) { + /** + * DOC: "brk" handling + * + * For architectures with ELF randomization, when executing a + * loader directly (i.e. static PIE: ET_DYN without PT_INTERP), + * move the brk area out of the mmap region and into the unused + * ELF_ET_DYN_BASE region. Since "brk" grows up it may collide + * early with the stack growing down or other regions being put + * into the mmap region by the kernel (e.g. vdso). + * + * In the CONFIG_COMPAT_BRK case, though, everything is turned + * off because we're not allowed to move the brk at all. + */ + if (!IS_ENABLED(CONFIG_COMPAT_BRK) && + IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && + elf_ex->e_type == ET_DYN && !interpreter) { + elf_brk = ELF_ET_DYN_BASE; + /* This counts as moving the brk, so let brk(2) know. */ + brk_moved = true; + } + mm->start_brk = mm->brk = ELF_PAGEALIGN(elf_brk); + + if ((current->flags & PF_RANDOMIZE) && snapshot_randomize_va_space > 1) { /* - * For architectures with ELF randomization, when executing - * a loader directly (i.e. no interpreter listed in ELF - * headers), move the brk area out of the mmap region - * (since it grows up, and may collide early with the stack - * growing down), and into the unused ELF_ET_DYN_BASE region. + * If we didn't move the brk to ELF_ET_DYN_BASE (above), + * leave a gap between .bss and brk. */ - if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && - elf_ex->e_type == ET_DYN && !interpreter) { - mm->brk = mm->start_brk = ELF_ET_DYN_BASE; - } + if (!brk_moved) + mm->brk = mm->start_brk = mm->brk + PAGE_SIZE; mm->brk = mm->start_brk = arch_randomize_brk(mm); + brk_moved = true; + } + #ifdef compat_brk_randomized + if (brk_moved) current->brk_randomized = 1; #endif - } if (current->personality & MMAP_PAGE_ZERO) { /* Why this, you ask??? Well SVr4 maps page 0 as read-only, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 434cf3d5f4cf..226e6434a58a 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1885,6 +1885,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + + /* + * Cache the zone_unusable value before turning the block group + * to read only. As soon as the block group is read only it's + * zone_unusable value gets moved to the block group's read-only + * bytes and isn't available for calculations anymore. We also + * cache it before unlocking the block group, to prevent races + * (reports from KCSAN and such tools) with tasks updating it. + */ + zone_unusable = bg->zone_unusable; + spin_unlock(&bg->lock); /* @@ -1900,13 +1911,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) goto next; } - /* - * Cache the zone_unusable value before turning the block group - * to read only. As soon as the blog group is read only it's - * zone_unusable value gets moved to the block group's read-only - * bytes and isn't available for calculations anymore. - */ - zone_unusable = bg->zone_unusable; ret = inc_block_group_ro(bg, 0); up_write(&space_info->groups_sem); if (ret < 0) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 944a7340f6a4..3981c941f5b5 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -167,13 +167,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = 0; queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); - /* - * If the block group is currently running in the discard workfn, we - * don't want to deref it, since it's still being used by the workfn. - * The workfn will notice this case and deref the block group when it is - * finished. - */ - if (queued && !running) + if (queued) btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -260,9 +254,10 @@ again: block_group->discard_cursor = block_group->start; block_group->discard_state = BTRFS_DISCARD_EXTENTS; } - discard_ctl->block_group = block_group; } if (block_group) { + btrfs_get_block_group(block_group); + discard_ctl->block_group = block_group; *discard_state = block_group->discard_state; *discard_index = block_group->discard_index; } @@ -493,9 +488,20 @@ static void btrfs_discard_workfn(struct work_struct *work) block_group = peek_discard_list(discard_ctl, &discard_state, &discard_index, now); - if (!block_group || !btrfs_run_discard_work(discard_ctl)) + if (!block_group) return; + if (!btrfs_run_discard_work(discard_ctl)) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); + return; + } if (now < block_group->discard_eligible_time) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); btrfs_discard_schedule_work(discard_ctl, false); return; } @@ -547,15 +553,7 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; - /* - * If the block group was removed from the discard list while it was - * running in this workfn, then we didn't deref it, since this function - * still owned that reference. But we set the discard_ctl->block_group - * back to NULL, so we can use that condition to know that now we need - * to deref the block_group. - */ - if (discard_ctl->block_group == NULL) - btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 967c6b5dd0a4..34a30d61b470 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4314,6 +4314,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); /* + * Handle the error fs first, as it will flush and wait for all ordered + * extents. This will generate delayed iputs, thus we want to handle + * it first. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) + btrfs_error_commit_super(fs_info); + + /* * Wait for any fixup workers to complete. * If we don't wait for them here and they are still running by the time * we call kthread_stop() against the cleaner kthread further below, we @@ -4334,6 +4342,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_flush_workqueue(fs_info->delalloc_workers); /* + * We can have ordered extents getting their last reference dropped from + * the fs_info->workers queue because for async writes for data bios we + * queue a work for that queue, at btrfs_wq_submit_bio(), that runs + * run_one_async_done() which calls btrfs_bio_end_io() in case the bio + * has an error, and that later function can do the final + * btrfs_put_ordered_extent() on the ordered extent attached to the bio, + * which adds a delayed iput for the inode. So we must flush the queue + * so that we don't have delayed iputs after committing the current + * transaction below and stopping the cleaner and transaction kthreads. + */ + btrfs_flush_workqueue(fs_info->workers); + + /* + * When finishing a compressed write bio we schedule a work queue item + * to finish an ordered extent - btrfs_finish_compressed_write_work() + * calls btrfs_finish_ordered_extent() which in turns does a call to + * btrfs_queue_ordered_fn(), and that queues the ordered extent + * completion either in the endio_write_workers work queue or in the + * fs_info->endio_freespace_worker work queue. We flush those queues + * below, so before we flush them we must flush this queue for the + * workers of compressed writes. + */ + flush_workqueue(fs_info->compressed_write_workers); + + /* * After we parked the cleaner kthread, ordered extents may have * completed and created new delayed iputs. If one of the async reclaim * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we @@ -4390,9 +4423,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (BTRFS_FS_ERROR(fs_info)) - btrfs_error_commit_super(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -4529,10 +4559,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) /* cleanup FS via transaction */ btrfs_cleanup_transaction(fs_info); - mutex_lock(&fs_info->cleaner_mutex); - btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_mutex); - down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 021cf468274b..ef77d4208510 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -164,6 +164,14 @@ search_again: ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); num_refs = btrfs_extent_refs(leaf, ei); + if (unlikely(num_refs == 0)) { + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected zero reference count for extent item (%llu %u %llu)", + key.objectid, key.type, key.offset); + btrfs_abort_transaction(trans, ret); + goto out_free; + } extent_flags = btrfs_extent_flags(leaf, ei); } else { ret = -EUCLEAN; @@ -177,8 +185,6 @@ search_again: goto out_free; } - - BUG_ON(num_refs == 0); } else { num_refs = 0; extent_flags = 0; @@ -208,10 +214,19 @@ search_again: goto search_again; } spin_lock(&head->lock); - if (head->extent_op && head->extent_op->update_flags) + if (head->extent_op && head->extent_op->update_flags) { extent_flags |= head->extent_op->flags_to_set; - else - BUG_ON(num_refs == 0); + } else if (unlikely(num_refs == 0)) { + spin_unlock(&head->lock); + mutex_unlock(&head->mutex); + spin_unlock(&delayed_refs->lock); + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected zero reference count for extent %llu (%s)", + bytenr, metadata ? "metadata" : "data"); + btrfs_abort_transaction(trans, ret); + goto out_free; + } num_refs += head->ref_mod; spin_unlock(&head->lock); @@ -5540,7 +5555,10 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 1); else ret = btrfs_dec_ref(trans, root, eb, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } if (is_fstree(root->root_key.objectid)) { ret = btrfs_qgroup_trace_leaf_items(trans, eb); if (ret) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b2ae50dcca0f..ed08d8e5639f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3565,10 +3565,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, return eb; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *eb, *exists = NULL; int ret; @@ -3604,8 +3604,11 @@ again: free_eb: btrfs_release_extent_buffer(eb); return exists; -} +#else + /* Stub to avoid linker error when compiled with optimizations turned off. */ + return NULL; #endif +} static struct extent_buffer *grab_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 68092b64e29e..e794606e7c78 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2225,15 +2225,20 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * will always return true. * So here we need to do extra page alignment for * filemap_range_has_page(). + * + * And do not decrease page_lockend right now, as it can be 0. */ const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); - const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; + const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE); while (1) { truncate_pagecache_range(inode, lockstart, lockend); lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); + /* The same page or adjacent pages. */ + if (page_lockend <= page_lockstart) + break; /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive @@ -2245,7 +2250,7 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * we do, unlock the range and retry. */ if (!filemap_range_has_page(inode->i_mapping, page_lockstart, - page_lockend)) + page_lockend - 1)) break; unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cedffa567a75..48d257923672 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1546,6 +1546,7 @@ out_unlock: locked_page, clear_bits, page_ops); + btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); start += cur_alloc_size; } @@ -1559,6 +1560,7 @@ out_unlock: clear_bits |= EXTENT_CLEAR_DATA_RESV; extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits, page_ops); + btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL); } return ret; } @@ -2056,12 +2058,13 @@ next_slot: /* * If the found extent starts after requested offset, then - * adjust extent_end to be right before this extent begins + * adjust cur_offset to be right before this extent begins. */ if (found_key.offset > cur_offset) { - extent_end = found_key.offset; - extent_type = 0; - goto must_cow; + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = found_key.offset; + goto next_slot; } /* @@ -2222,13 +2225,15 @@ error: */ if (cow_start != (u64)-1) cur_offset = cow_start; - if (cur_offset < end) + if (cur_offset < end) { extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); + } btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 537e184b4b1d..474758c878fc 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2931,6 +2931,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, int ret; ASSERT(page_index <= last_index); +again: page = find_lock_page(inode->i_mapping, page_index); if (!page) { page_cache_sync_readahead(inode->i_mapping, ra, NULL, @@ -2952,6 +2953,11 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, ret = -EIO; goto release_page; } + if (page->mapping != inode->i_mapping) { + unlock_page(page); + put_page(page); + goto again; + } } /* diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6be092bb814f..7632d652a125 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -153,12 +153,14 @@ struct scrub_stripe { unsigned int init_nr_io_errors; unsigned int init_nr_csum_errors; unsigned int init_nr_meta_errors; + unsigned int init_nr_meta_gen_errors; /* * The following error bitmaps are all for the current status. * Every time we submit a new read, these bitmaps may be updated. * - * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap; + * error_bitmap = io_error_bitmap | csum_error_bitmap | + * meta_error_bitmap | meta_generation_bitmap; * * IO and csum errors can happen for both metadata and data. */ @@ -166,6 +168,7 @@ struct scrub_stripe { unsigned long io_error_bitmap; unsigned long csum_error_bitmap; unsigned long meta_error_bitmap; + unsigned long meta_gen_error_bitmap; /* For writeback (repair or replace) error reporting. */ unsigned long write_error_bitmap; @@ -617,7 +620,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr memcpy(on_disk_csum, header->csum, fs_info->csum_size); if (logical != btrfs_stack_header_bytenr(header)) { - bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad bytenr, has %llu want %llu", @@ -673,7 +676,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } if (stripe->sectors[sector_nr].generation != btrfs_stack_header_generation(header)) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_set(&stripe->meta_gen_error_bitmap, sector_nr, sectors_per_tree); bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad generation, has %llu want %llu", @@ -685,6 +688,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree); bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + bitmap_clear(&stripe->meta_gen_error_bitmap, sector_nr, sectors_per_tree); } static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) @@ -973,8 +977,22 @@ skip: if (__ratelimit(&rs) && dev) scrub_print_common_warning("header error", dev, false, stripe->logical, physical); + if (test_bit(sector_nr, &stripe->meta_gen_error_bitmap)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("generation error", dev, false, + stripe->logical, physical); } + /* Update the device stats. */ + for (int i = 0; i < stripe->init_nr_io_errors; i++) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS); + for (int i = 0; i < stripe->init_nr_csum_errors; i++) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); + /* Generation mismatch error is based on each metadata, not each block. */ + for (int i = 0; i < stripe->init_nr_meta_gen_errors; + i += (fs_info->nodesize >> fs_info->sectorsize_bits)) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS); + spin_lock(&sctx->stat_lock); sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; @@ -983,7 +1001,8 @@ skip: sctx->stat.no_csum += nr_nodatacsum_sectors; sctx->stat.read_errors += stripe->init_nr_io_errors; sctx->stat.csum_errors += stripe->init_nr_csum_errors; - sctx->stat.verify_errors += stripe->init_nr_meta_errors; + sctx->stat.verify_errors += stripe->init_nr_meta_errors + + stripe->init_nr_meta_gen_errors; sctx->stat.uncorrectable_errors += bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); sctx->stat.corrected_errors += nr_repaired_sectors; @@ -1029,6 +1048,8 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) stripe->nr_sectors); stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors); + stripe->init_nr_meta_gen_errors = bitmap_weight(&stripe->meta_gen_error_bitmap, + stripe->nr_sectors); if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) goto out; @@ -1143,6 +1164,9 @@ static void scrub_write_endio(struct btrfs_bio *bbio) bitmap_set(&stripe->write_error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&stripe->write_error_lock, flags); + for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) + btrfs_dev_stat_inc_and_print(stripe->dev, + BTRFS_DEV_STAT_WRITE_ERRS); } bio_put(&bbio->bio); @@ -1505,10 +1529,12 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) stripe->init_nr_io_errors = 0; stripe->init_nr_csum_errors = 0; stripe->init_nr_meta_errors = 0; + stripe->init_nr_meta_gen_errors = 0; stripe->error_bitmap = 0; stripe->io_error_bitmap = 0; stripe->csum_error_bitmap = 0; stripe->meta_error_bitmap = 0; + stripe->meta_gen_error_bitmap = 0; } /* @@ -1538,8 +1564,8 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; - if (unlikely(!extent_root)) { - btrfs_err(fs_info, "no valid extent root for scrub"); + if (unlikely(!extent_root || !csum_root)) { + btrfs_err(fs_info, "no valid extent or csum root for scrub"); return -EUCLEAN; } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index aa1e6d88a72c..e2ead36e5be4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -487,10 +487,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; - if (len > PATH_MAX) { - WARN_ON(1); - return -ENOMEM; - } + if (WARN_ON(len > PATH_MAX)) + return -ENAMETOOLONG; path_len = p->end - p->start; old_buf_len = p->buf_len; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index fb4992570c2b..2045ac3d94c4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1336,8 +1336,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) subvol_name = btrfs_get_subvol_name_from_objectid(info, BTRFS_I(d_inode(dentry))->root->root_key.objectid); if (!IS_ERR(subvol_name)) { - seq_puts(seq, ",subvol="); - seq_escape(seq, subvol_name, " \t\n\\"); + seq_show_option(seq, "subvol", subvol_name); kfree(subvol_name); } return 0; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 53c74010140e..6d16506bbdc0 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1889,6 +1889,11 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) return BTRFS_TREE_BLOCK_INVALID_LEVEL; } + if (unlikely(!btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN))) { + generic_err(leaf, 0, "invalid flag for leaf, WRITTEN not set"); + return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET; + } + /* * Extent buffers from a relocation tree have a owner field that * corresponds to the subvolume tree they are based on. So just from an @@ -1950,6 +1955,7 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) for (slot = 0; slot < nritems; slot++) { u32 item_end_expected; u64 item_data_end; + enum btrfs_tree_block_status ret; btrfs_item_key_to_cpu(leaf, &key, slot); @@ -2005,21 +2011,10 @@ enum btrfs_tree_block_status __btrfs_check_leaf(struct extent_buffer *leaf) return BTRFS_TREE_BLOCK_INVALID_OFFSETS; } - /* - * We only want to do this if WRITTEN is set, otherwise the leaf - * may be in some intermediate state and won't appear valid. - */ - if (btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_WRITTEN)) { - enum btrfs_tree_block_status ret; - - /* - * Check if the item size and content meet other - * criteria - */ - ret = check_leaf_item(leaf, &key, slot, &prev_key); - if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) - return ret; - } + /* Check if the item size and content meet other criteria. */ + ret = check_leaf_item(leaf, &key, slot, &prev_key); + if (unlikely(ret != BTRFS_TREE_BLOCK_CLEAN)) + return ret; prev_key.objectid = key.objectid; prev_key.type = key.type; @@ -2049,6 +2044,11 @@ enum btrfs_tree_block_status __btrfs_check_node(struct extent_buffer *node) int level = btrfs_header_level(node); u64 bytenr; + if (unlikely(!btrfs_header_flag(node, BTRFS_HEADER_FLAG_WRITTEN))) { + generic_err(node, 0, "invalid flag for node, WRITTEN not set"); + return BTRFS_TREE_BLOCK_WRITTEN_NOT_SET; + } + if (unlikely(level <= 0 || level >= BTRFS_MAX_LEVEL)) { generic_err(node, 0, "invalid level for node, have %d expect [1, %d]", diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 3c2a02a72f64..43f2ceb78f34 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -51,6 +51,7 @@ enum btrfs_tree_block_status { BTRFS_TREE_BLOCK_INVALID_BLOCKPTR, BTRFS_TREE_BLOCK_INVALID_ITEM, BTRFS_TREE_BLOCK_INVALID_OWNER, + BTRFS_TREE_BLOCK_WRITTEN_NOT_SET, }; /* diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c4463c3f2068..197dfafbf401 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2006,6 +2006,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) physical = map->stripes[i].physical; zinfo = device->zone_info; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2165,6 +2168,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ const u64 physical = map->stripes[i].physical; struct btrfs_zoned_device_info *zinfo = device->zone_info; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index db6977c15c28..f0befbeb6cb8 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2319,7 +2319,7 @@ static int fill_fscrypt_truncate(struct inode *inode, /* Try to writeback the dirty pagecaches */ if (issued & (CEPH_CAP_FILE_BUFFER)) { - loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1; + loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1; ret = filemap_write_and_wait_range(inode->i_mapping, orig_pos, lend); diff --git a/fs/coredump.c b/fs/coredump.c index 9d235fa14ab9..d3a4f5dc2e36 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -42,6 +42,7 @@ #include <linux/timekeeping.h> #include <linux/sysctl.h> #include <linux/elf.h> +#include <uapi/linux/pidfd.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -56,6 +57,13 @@ static bool dump_vma_snapshot(struct coredump_params *cprm); static void free_vma_snapshot(struct coredump_params *cprm); +/* + * File descriptor number for the pidfd for the thread-group leader of + * the coredumping task installed into the usermode helper's file + * descriptor table. + */ +#define COREDUMP_PIDFD_NUMBER 3 + static int core_uses_pid; static unsigned int core_pipe_limit; static char core_pattern[CORENAME_MAX_SIZE] = "core"; @@ -332,6 +340,27 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, case 'C': err = cn_printf(cn, "%d", cprm->cpu); break; + /* pidfd number */ + case 'F': { + /* + * Installing a pidfd only makes sense if + * we actually spawn a usermode helper. + */ + if (!ispipe) + break; + + /* + * Note that we'll install a pidfd for the + * thread-group leader. We know that task + * linkage hasn't been removed yet and even if + * this @current isn't the actual thread-group + * leader we know that the thread-group leader + * cannot be reaped until @current has exited. + */ + cprm->pid = task_tgid(current); + err = cn_printf(cn, "%d", COREDUMP_PIDFD_NUMBER); + break; + } default: break; } @@ -488,7 +517,7 @@ static void wait_for_dump_helpers(struct file *file) } /* - * umh_pipe_setup + * umh_coredump_setup * helper function to customize the process used * to collect the core in userspace. Specifically * it sets up a pipe and installs it as fd 0 (stdin) @@ -498,21 +527,61 @@ static void wait_for_dump_helpers(struct file *file) * is a special value that we use to trap recursive * core dumps */ -static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) { struct file *files[2]; + struct file *pidfs_file = NULL; struct coredump_params *cp = (struct coredump_params *)info->data; - int err = create_pipe_files(files, 0); + int err; + + if (cp->pid) { + int fd; + + fd = pidfd_prepare(cp->pid, 0, &pidfs_file); + if (fd < 0) + return fd; + + /* + * We don't care about the fd. We also cannot simply + * replace it below because dup2() will refuse to close + * this file descriptor if its in a larval state. So + * close it! + */ + put_unused_fd(fd); + + /* + * Usermode helpers are childen of either + * system_unbound_wq or of kthreadd. So we know that + * we're starting off with a clean file descriptor + * table. So we should always be able to use + * COREDUMP_PIDFD_NUMBER as our file descriptor value. + */ + err = replace_fd(COREDUMP_PIDFD_NUMBER, pidfs_file, 0); + if (err < 0) + goto out_fail; + + pidfs_file = NULL; + } + + err = create_pipe_files(files, 0); if (err) - return err; + goto out_fail; cp->file = files[1]; err = replace_fd(0, files[0], 0); fput(files[0]); + if (err < 0) + goto out_fail; + /* and disallow core files too */ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; + err = 0; + +out_fail: + if (pidfs_file) + fput(pidfs_file); return err; } @@ -589,7 +658,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) } if (cprm.limit == 1) { - /* See umh_pipe_setup() which sets RLIMIT_CORE = 1. + /* See umh_coredump_setup() which sets RLIMIT_CORE = 1. * * Normally core limits are irrelevant to pipes, since * we're not writing to the file system, but we use @@ -634,7 +703,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) retval = -ENOMEM; sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL, GFP_KERNEL, - umh_pipe_setup, NULL, &cprm); + umh_coredump_setup, NULL, &cprm); if (sub_info) retval = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 0618af36f550..3c9ab6461579 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1826,8 +1826,8 @@ static int dlm_tcp_listen_validate(void) { /* We don't support multi-homed hosts */ if (dlm_local_count > 1) { - log_print("TCP protocol can't handle multi-homed hosts, try SCTP"); - return -EINVAL; + log_print("Detect multi-homed hosts but use only the first IP address."); + log_print("Try SCTP, if you want to enable multi-link."); } return 0; diff --git a/fs/exec.c b/fs/exec.c index 4a6255aa4ea7..ee71a315cc51 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1257,13 +1257,12 @@ int begin_new_exec(struct linux_binprm * bprm) */ bprm->point_of_no_return = true; - /* - * Make this the only thread in the thread group. - */ + /* Make this the only thread in the thread group */ retval = de_thread(me); if (retval) goto out; - + /* see the comment in check_unsafe_exec() */ + current->fs->in_exec = 0; /* * Cancel any io_uring activity across execve */ @@ -1516,6 +1515,8 @@ static void free_bprm(struct linux_binprm *bprm) } free_arg_pages(bprm); if (bprm->cred) { + /* in case exec fails before de_thread() succeeds */ + current->fs->in_exec = 0; mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } @@ -1604,6 +1605,10 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... + * + * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS) + * from another sub-thread until de_thread() succeeds, this + * state is protected by cred_guard_mutex we hold. */ t = p; n_fs = 1; @@ -1890,7 +1895,6 @@ static int bprm_execve(struct linux_binprm *bprm, sched_mm_cid_after_execve(current); /* execve succeeded */ - current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); user_events_execve(current); @@ -1910,7 +1914,6 @@ out: out_unmark: sched_mm_cid_after_execve(current); - current->fs->in_exec = 0; current->in_execve = 0; return retval; diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 74590041fb2c..24e1e05f9f34 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -265,7 +265,7 @@ int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain, clu = next; if (exfat_ent_get(sb, clu, &next)) return -EIO; - } while (next != EXFAT_EOF_CLUSTER); + } while (next != EXFAT_EOF_CLUSTER && count <= p_chain->size); if (p_chain->size != count) { exfat_fs_error(sb, diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 396474e9e2bf..3a2dfc59fb40 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -641,8 +641,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || - capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + (flags & EXT4_MB_USE_ROOT_BLOCKS) || + capable(CAP_SYS_RESOURCE)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 6fe3c941b565..4d6ba140276b 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line, { __le32 *bref = p; unsigned int blk; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + if (journal && inode == journal->j_inode) return 0; while (bref < p+max) { diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 7ea33c3fe94e..6682b8ab11f1 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; + else if (unlikely(next_offset == size && de->name_len == 1 && + de->name[0] == '.')) + error_msg = "'.' directory cannot be the last in data block"; else return 0; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 60455c84a937..81fe87fcbfa0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -273,7 +273,8 @@ struct ext4_system_blocks { /* * Flags for ext4_io_end->flags */ -#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_FAILED 0x0002 struct ext4_io_end_vec { struct list_head list; /* list of io_end_vec */ @@ -2994,6 +2995,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); +extern int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end); extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 32218ac7f50f..39e3661a80c4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4660,22 +4660,13 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_mutex; } - /* - * For journalled data we need to write (and checkpoint) pages - * before discarding page cache to avoid inconsitent data on - * disk in case of crash before zeroing trans is committed. - */ - if (ext4_should_journal_data(inode)) { - ret = filemap_write_and_wait_range(mapping, start, - end - 1); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } + /* Now release the pages and zero block aligned part of pages */ + ret = ext4_truncate_page_cache_block_range(inode, start, end); + if (ret) { + filemap_invalidate_unlock(mapping); + goto out_mutex; } - /* Now release the pages and zero block aligned part of pages */ - truncate_pagecache_range(inode, start, end - 1); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 19d7bcf16ebb..86245e27be18 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -31,6 +31,7 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> +#include <linux/rmap.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> @@ -378,10 +379,11 @@ static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal && inode == journal->j_inode) return 0; + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " @@ -3891,6 +3893,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return ret; } +static inline void ext4_truncate_folio(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + struct folio *folio; + + /* Nothing to be done if no complete block needs to be truncated. */ + if (round_up(start, blocksize) >= round_down(end, blocksize)) + return; + + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); +} + +int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + int ret; + + /* + * For journalled data we need to write (and checkpoint) pages + * before discarding page cache to avoid inconsitent data on disk + * in case of crash before freeing or unwritten converting trans + * is committed. + */ + if (ext4_should_journal_data(inode)) { + ret = filemap_write_and_wait_range(inode->i_mapping, start, + end - 1); + if (ret) + return ret; + goto truncate_pagecache; + } + + /* + * If the block size is less than the page size, the file's mapped + * blocks within one page could be freed or converted to unwritten. + * So it's necessary to remove writable userspace mappings, and then + * ext4_page_mkwrite() can be called during subsequent write access + * to these partial folios. + */ + if (!IS_ALIGNED(start | end, PAGE_SIZE) && + blocksize < PAGE_SIZE && start < inode->i_size) { + loff_t page_boundary = round_up(start, PAGE_SIZE); + + ext4_truncate_folio(inode, start, min(page_boundary, end)); + if (end > page_boundary) + ext4_truncate_folio(inode, + round_down(end, PAGE_SIZE), end); + } + +truncate_pagecache: + truncate_pagecache_range(inode, start, end - 1); + return 0; +} + static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); @@ -3945,17 +4009,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length, 0); - /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + length - 1); - if (ret) - return ret; - } - inode_lock(inode); /* No need to punch hole beyond i_size */ @@ -4017,8 +4070,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) goto out_dio; - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); + + ret = ext4_truncate_page_cache_block_range(inode, + first_block_offset, last_block_offset + 1); + if (ret) + goto out_dio; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4692,22 +4748,43 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) inode_set_iversion_queried(inode, val); } -static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) - +static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, + const char *function, unsigned int line) { + const char *err_str; + if (flags & EXT4_IGET_EA_INODE) { - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "missing EA_INODE flag"; + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + err_str = "missing EA_INODE flag"; + goto error; + } if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || - EXT4_I(inode)->i_file_acl) - return "ea_inode with extended attributes"; + EXT4_I(inode)->i_file_acl) { + err_str = "ea_inode with extended attributes"; + goto error; + } } else { - if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "unexpected EA_INODE flag"; + if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + /* + * open_by_handle_at() could provide an old inode number + * that has since been reused for an ea_inode; this does + * not indicate filesystem corruption + */ + if (flags & EXT4_IGET_HANDLE) + return -ESTALE; + err_str = "unexpected EA_INODE flag"; + goto error; + } } - if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) - return "unexpected bad inode w/o EXT4_IGET_BAD"; - return NULL; + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { + err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; + goto error; + } + return 0; + +error: + ext4_error_inode(inode, function, line, 0, err_str); + return -EFSCORRUPTED; } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, @@ -4719,7 +4796,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; - const char *err_str; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; @@ -4748,10 +4824,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); + ret = check_igot_inode(inode, flags, function, line); + if (ret) { iput(inode); - return ERR_PTR(-EFSCORRUPTED); + return ERR_PTR(ret); } return inode; } @@ -5023,13 +5099,21 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); - ret = -EFSCORRUPTED; - goto bad_inode; + ret = check_igot_inode(inode, flags, function, line); + /* + * -ESTALE here means there is nothing inherently wrong with the inode, + * it's just not an inode we can return for an fhandle lookup. + */ + if (ret == -ESTALE) { + brelse(iloc.bh); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(-ESTALE); } - + if (ret) + goto bad_inode; brelse(iloc.bh); + unlock_new_inode(inode); return inode; @@ -5450,7 +5534,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, oldsize & (inode->i_sb->s_blocksize - 1)) { error = ext4_inode_attach_jinode(inode); if (error) - goto err_out; + goto out_mmap_sem; } handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 96a048d3f51b..2e4575becd4f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2041,7 +2041,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, * split it in half by count; each resulting block will have at least * half the space free. */ - if (i > 0) + if (i >= 0) split = count - move; else split = count/2; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7ab4f5a9bf5b..7287dbfe13f1 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -181,14 +181,25 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) "list->prev 0x%p\n", io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io_end->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_io_end_vec(handle, io_end); + /* + * Do not convert the unwritten extents if data writeback fails, + * or stale data may be exposed. + */ + io_end->handle = NULL; /* Following call will use up the handle */ + if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) { + ret = -EIO; + if (handle) + jbd2_journal_free_reserved(handle); + } else { + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); + } if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " "(inode %lu, error %d)", inode->i_ino, ret); } + ext4_clear_io_unwritten_flag(io_end); ext4_release_io_end(io_end); return ret; @@ -344,6 +355,7 @@ static void ext4_end_bio(struct bio *bio) bio->bi_status, inode->i_ino, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + io_end->flag |= EXT4_IO_END_FAILED; mapping_set_error(inode->i_mapping, blk_status_to_errno(bio->bi_status)); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f019ce64eba4..d2b58f940aab 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2821,6 +2821,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc, } if (is_remount) { + if (!sbi->s_journal && + ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) { + ext4_msg(NULL, KERN_WARNING, + "Remounting fs w/o journal so ignoring data_err option"); + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT); + } + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " @@ -5421,6 +5428,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) "data=, fs mounted w/o journal"); goto failed_mount3a; } + if (test_opt(sb, DATA_ERR_ABORT)) { + ext4_msg(sb, KERN_ERR, + "can't mount with data_err=abort, fs mounted w/o journal"); + goto failed_mount3a; + } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); @@ -6771,6 +6783,7 @@ static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; + bool old_ro = sb_rdonly(sb); fc->s_fs_info = EXT4_SB(sb); @@ -6782,9 +6795,9 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", - &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", - ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.", + &sb->s_uuid, + (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : ""); return 0; } @@ -6808,22 +6821,29 @@ static int ext4_statfs_project(struct super_block *sb, dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; - if (limit && buf->f_blocks > limit) { + if (limit) { + uint64_t remaining = 0; + curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; - buf->f_blocks = limit; - buf->f_bfree = buf->f_bavail = - (buf->f_blocks > curblock) ? - (buf->f_blocks - curblock) : 0; + if (limit > curblock) + remaining = limit - curblock; + + buf->f_blocks = min(buf->f_blocks, limit); + buf->f_bfree = min(buf->f_bfree, remaining); + buf->f_bavail = min(buf->f_bavail, remaining); } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); - if (limit && buf->f_files > limit) { - buf->f_files = limit; - buf->f_ffree = - (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? - (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + if (limit) { + uint64_t remaining = 0; + + if (limit > dquot->dq_dqb.dqb_curinodes) + remaining = limit - dquot->dq_dqb.dqb_curinodes; + + buf->f_files = min(buf->f_files, limit); + buf->f_ffree = min(buf->f_ffree, remaining); } spin_unlock(&dquot->dq_dqb_lock); @@ -6926,12 +6946,25 @@ static int ext4_release_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; + bool freeze_protected = false; + + /* + * Trying to sb_start_intwrite() in a running transaction + * can result in a deadlock. Further, running transactions + * are already protected from freezing. + */ + if (!ext4_journal_current_handle()) { + sb_start_intwrite(dquot->dq_sb); + freeze_protected = true; + } handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); return PTR_ERR(handle); } ret = dquot_release(dquot); @@ -6942,6 +6975,10 @@ static int ext4_release_dquot(struct dquot *dquot) err = ext4_journal_stop(handle); if (!ret) ret = err; + + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); + return ret; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index df5ab1a75fc4..ca22aa9e04b4 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1176,15 +1176,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, { struct inode *ea_inode; struct ext4_xattr_entry *entry; + struct ext4_iloc iloc; bool dirty = false; unsigned int ea_ino; int err; int credits; + void *end; + + if (block_csum) + end = (void *)bh->b_data + bh->b_size; + else { + ext4_get_inode_loc(parent, &iloc); + end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size; + } /* One credit for dec ref on ea_inode, one for orphan list addition, */ credits = 2 + extra_credits; - for (entry = first; !IS_LAST_ENTRY(entry); + for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c6317596e695..3ec815e615e7 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1327,21 +1327,13 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long flags; - if (cpc->reason & CP_UMOUNT) { - if (le32_to_cpu(ckpt->cp_pack_total_block_count) + - NM_I(sbi)->nat_bits_blocks > BLKS_PER_SEG(sbi)) { - clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Disable nat_bits due to no space"); - } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && - f2fs_nat_bitmap_enabled(sbi)) { - f2fs_enable_nat_bits(sbi); - set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Rebuild and enable nat_bits"); - } - } - spin_lock_irqsave(&sbi->cp_lock, flags); + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); else @@ -1524,8 +1516,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); /* write nat bits */ - if ((cpc->reason & CP_UMOUNT) && - is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) { + if (enabled_nat_bits(sbi, cpc)) { __u64 cp_ver = cur_cp_version(ckpt); block_t blk; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index acd0764b0286..a123bb26acd8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -54,8 +54,8 @@ bool f2fs_is_cp_guaranteed(struct page *page) struct inode *inode; struct f2fs_sb_info *sbi; - if (!mapping) - return false; + if (fscrypt_is_bounce_page(page)) + return page_private_gcing(fscrypt_pagecache_page(page)); inode = mapping->host; sbi = F2FS_I_SB(inode); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d28e3df61cc4..911c4c64d729 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2197,6 +2197,36 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem) #endif } +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + unsigned long flags; + unsigned char *nat_bits; + + /* + * In order to re-enable nat_bits we need to call fsck.f2fs by + * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost, + * so let's rely on regular fsck or unclean shutdown. + */ + + if (lock) + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + nat_bits = NM_I(sbi)->nat_bits; + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock_irqrestore(&sbi->cp_lock, flags); + + kvfree(nat_bits); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { f2fs_down_read(&sbi->cp_rwsem); @@ -2444,8 +2474,14 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); - sbi->total_valid_block_count -= (block_t)count; + if (unlikely(sbi->total_valid_block_count < count)) { + f2fs_warn(sbi, "Inconsistent total_valid_block_count:%u, ino:%lu, count:%u", + sbi->total_valid_block_count, inode->i_ino, count); + sbi->total_valid_block_count = 0; + set_sbi_flag(sbi, SBI_NEED_FSCK); + } else { + sbi->total_valid_block_count -= count; + } if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks = min(sbi->reserved_blocks, @@ -3623,7 +3659,6 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, unsigned int seq_id); -bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi); int f2fs_remove_inode_page(struct inode *inode); struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); @@ -3648,7 +3683,6 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); -void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 7ad4a9241759..06941705e893 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -35,10 +35,8 @@ void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) if (f2fs_inode_dirtied(inode, sync)) return; - if (f2fs_is_atomic_file(inode)) { - set_inode_flag(inode, FI_ATOMIC_DIRTIED); + if (f2fs_is_atomic_file(inode)) return; - } mark_inode_dirty_sync(inode); } @@ -764,8 +762,12 @@ retry: if (err == -ENOENT) return; + if (err == -EFSCORRUPTED) + goto stop_checkpoint; + if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; +stop_checkpoint: f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE); return; } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2e08e1fdf485..4d6f0a6365fe 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -411,7 +411,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && (!projid_eq(F2FS_I(dir)->i_projid, - F2FS_I(old_dentry->d_inode)->i_projid))) + F2FS_I(inode)->i_projid))) return -EXDEV; err = f2fs_dquot_initialize(dir); @@ -905,7 +905,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && (!projid_eq(F2FS_I(new_dir)->i_projid, - F2FS_I(old_dentry->d_inode)->i_projid))) + F2FS_I(old_inode)->i_projid))) return -EXDEV; /* @@ -1101,10 +1101,10 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && !projid_eq(F2FS_I(new_dir)->i_projid, - F2FS_I(old_dentry->d_inode)->i_projid)) || - (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + F2FS_I(old_inode)->i_projid)) || + (is_inode_flag_set(old_dir, FI_PROJ_INHERIT) && !projid_eq(F2FS_I(old_dir)->i_projid, - F2FS_I(new_dentry->d_inode)->i_projid))) + F2FS_I(new_inode)->i_projid))) return -EXDEV; err = f2fs_dquot_initialize(old_dir); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dedba481b66d..b00d66b95321 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1134,7 +1134,14 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); - if (level < 0) { + if (level <= 0) { + if (!level) { + level = -EFSCORRUPTED; + f2fs_err(sbi, "%s: inode ino=%lx has corrupted node block, from:%lu addrs:%u", + __func__, inode->i_ino, + from, ADDRS_PER_INODE(inode)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } trace_f2fs_truncate_inode_blocks_exit(inode, level); return level; } @@ -2258,24 +2265,6 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, } } -bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned int i; - bool ret = true; - - f2fs_down_read(&nm_i->nat_tree_lock); - for (i = 0; i < nm_i->nat_blocks; i++) { - if (!test_bit_le(i, nm_i->nat_block_bitmap)) { - ret = false; - break; - } - } - f2fs_up_read(&nm_i->nat_tree_lock); - - return ret; -} - static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set, bool build) { @@ -2954,23 +2943,7 @@ add_out: list_add_tail(&nes->set_list, head); } -static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs, - unsigned int valid) -{ - if (valid == 0) { - __set_bit_le(nat_ofs, nm_i->empty_nat_bits); - __clear_bit_le(nat_ofs, nm_i->full_nat_bits); - return; - } - - __clear_bit_le(nat_ofs, nm_i->empty_nat_bits); - if (valid == NAT_ENTRY_PER_BLOCK) - __set_bit_le(nat_ofs, nm_i->full_nat_bits); - else - __clear_bit_le(nat_ofs, nm_i->full_nat_bits); -} - -static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, struct page *page) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -2979,7 +2952,7 @@ static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, int valid = 0; int i = 0; - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + if (!enabled_nat_bits(sbi, NULL)) return; if (nat_index == 0) { @@ -2990,36 +2963,17 @@ static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR) valid++; } - - __update_nat_bits(nm_i, nat_index, valid); -} - -void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned int nat_ofs; - - f2fs_down_read(&nm_i->nat_tree_lock); - - for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { - unsigned int valid = 0, nid_ofs = 0; - - /* handle nid zero due to it should never be used */ - if (unlikely(nat_ofs == 0)) { - valid = 1; - nid_ofs = 1; - } - - for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) { - if (!test_bit_le(nid_ofs, - nm_i->free_nid_bitmap[nat_ofs])) - valid++; - } - - __update_nat_bits(nm_i, nat_ofs, valid); + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; } - f2fs_up_read(&nm_i->nat_tree_lock); + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); } static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, @@ -3038,7 +2992,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if ((cpc->reason & CP_UMOUNT) || + if (enabled_nat_bits(sbi, cpc) || !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; @@ -3085,7 +3039,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { up_write(&curseg->journal_rwsem); } else { - update_nat_bits(sbi, start_nid, page); + __update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); } @@ -3116,7 +3070,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * during unmount, let's flush nat_bits before checking * nat_cnt[DIRTY_NAT]. */ - if (cpc->reason & CP_UMOUNT) { + if (enabled_nat_bits(sbi, cpc)) { f2fs_down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); f2fs_up_write(&nm_i->nat_tree_lock); @@ -3132,7 +3086,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and merge them * into nat entry set. */ - if (cpc->reason & CP_UMOUNT || + if (enabled_nat_bits(sbi, cpc) || !__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); @@ -3169,18 +3123,15 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) __u64 cp_ver = cur_cp_version(ckpt); block_t nat_bits_addr; + if (!enabled_nat_bits(sbi, NULL)) + return 0; + nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); nm_i->nat_bits = f2fs_kvzalloc(sbi, nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; - nm_i->full_nat_bits = nm_i->nat_bits + 8; - nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; - - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) - return 0; - nat_bits_addr = __start_cp_addr(sbi) + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { @@ -3197,12 +3148,13 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) cp_ver |= (cur_cp_crc(ckpt) << 32); if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { - clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)", - cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits)); + disable_nat_bits(sbi, true); return 0; } + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + f2fs_notice(sbi, "Found nat_bits in checkpoint"); return 0; } @@ -3213,7 +3165,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) unsigned int i = 0; nid_t nid, last_nid; - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + if (!enabled_nat_bits(sbi, NULL)) return; for (i = 0; i < nm_i->nat_blocks; i++) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index aa0e7cc2489a..6b3cafbe9867 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1499,6 +1499,10 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync) inc_page_count(sbi, F2FS_DIRTY_IMETA); } spin_unlock(&sbi->inode_lock[DIRTY_META]); + + if (!ret && f2fs_is_atomic_file(inode)) + set_inode_flag(inode, FI_ATOMIC_DIRTIED); + return ret; } @@ -1845,9 +1849,9 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid = u64_to_fsid(id); #ifdef CONFIG_QUOTA - if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + if (is_inode_flag_set(d_inode(dentry), FI_PROJ_INHERIT) && sb_has_quota_limits_enabled(sb, PRJQUOTA)) { - f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + f2fs_statfs_project(sb, F2FS_I(d_inode(dentry))->i_projid, buf); } #endif return 0; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 180feefc4a9c..c4b0661888a1 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -61,6 +61,12 @@ struct f2fs_attr { int id; }; +struct f2fs_base_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_base_attr *a, char *buf); + ssize_t (*store)(struct f2fs_base_attr *a, const char *buf, size_t len); +}; + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); @@ -791,6 +797,25 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +static ssize_t f2fs_base_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->show ? a->show(a, buf) : 0; +} + +static ssize_t f2fs_base_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->store ? a->store(a, buf, len) : 0; +} + /* * Note that there are three feature list entries: * 1) /sys/fs/f2fs/features @@ -809,14 +834,13 @@ static void f2fs_sb_release(struct kobject *kobj) * please add new on-disk feature in this list only. * - ref. F2FS_SB_FEATURE_RO_ATTR() */ -static ssize_t f2fs_feature_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) +static ssize_t f2fs_feature_show(struct f2fs_base_attr *a, char *buf) { return sysfs_emit(buf, "supported\n"); } #define F2FS_FEATURE_RO_ATTR(_name) \ -static struct f2fs_attr f2fs_attr_##_name = { \ +static struct f2fs_base_attr f2fs_base_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = 0444 }, \ .show = f2fs_feature_show, \ } @@ -1166,37 +1190,38 @@ static struct attribute *f2fs_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs); +#define BASE_ATTR_LIST(name) (&f2fs_base_attr_##name.attr) static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION - ATTR_LIST(encryption), - ATTR_LIST(test_dummy_encryption_v2), + BASE_ATTR_LIST(encryption), + BASE_ATTR_LIST(test_dummy_encryption_v2), #if IS_ENABLED(CONFIG_UNICODE) - ATTR_LIST(encrypted_casefold), + BASE_ATTR_LIST(encrypted_casefold), #endif #endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED - ATTR_LIST(block_zoned), + BASE_ATTR_LIST(block_zoned), #endif - ATTR_LIST(atomic_write), - ATTR_LIST(extra_attr), - ATTR_LIST(project_quota), - ATTR_LIST(inode_checksum), - ATTR_LIST(flexible_inline_xattr), - ATTR_LIST(quota_ino), - ATTR_LIST(inode_crtime), - ATTR_LIST(lost_found), + BASE_ATTR_LIST(atomic_write), + BASE_ATTR_LIST(extra_attr), + BASE_ATTR_LIST(project_quota), + BASE_ATTR_LIST(inode_checksum), + BASE_ATTR_LIST(flexible_inline_xattr), + BASE_ATTR_LIST(quota_ino), + BASE_ATTR_LIST(inode_crtime), + BASE_ATTR_LIST(lost_found), #ifdef CONFIG_FS_VERITY - ATTR_LIST(verity), + BASE_ATTR_LIST(verity), #endif - ATTR_LIST(sb_checksum), + BASE_ATTR_LIST(sb_checksum), #if IS_ENABLED(CONFIG_UNICODE) - ATTR_LIST(casefold), + BASE_ATTR_LIST(casefold), #endif - ATTR_LIST(readonly), + BASE_ATTR_LIST(readonly), #ifdef CONFIG_F2FS_FS_COMPRESSION - ATTR_LIST(compression), + BASE_ATTR_LIST(compression), #endif - ATTR_LIST(pin_file), + BASE_ATTR_LIST(pin_file), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); @@ -1263,9 +1288,14 @@ static struct kset f2fs_kset = { .kobj = {.ktype = &f2fs_ktype}, }; +static const struct sysfs_ops f2fs_feat_attr_ops = { + .show = f2fs_base_attr_show, + .store = f2fs_base_attr_store, +}; + static const struct kobj_type f2fs_feat_ktype = { .default_groups = f2fs_feat_groups, - .sysfs_ops = &f2fs_attr_ops, + .sysfs_ops = &f2fs_feat_attr_ops, }; static struct kobject f2fs_feat = { diff --git a/fs/file.c b/fs/file.c index a178efc8cf4b..f8cf6728c6a0 100644 --- a/fs/file.c +++ b/fs/file.c @@ -362,17 +362,25 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho old_fds = old_fdt->fd; new_fds = new_fdt->fd; + /* + * We may be racing against fd allocation from other threads using this + * files_struct, despite holding ->file_lock. + * + * alloc_fd() might have already claimed a slot, while fd_install() + * did not populate it yet. Note the latter operates locklessly, so + * the file can show up as we are walking the array below. + * + * At the same time we know no files will disappear as all other + * operations take the lock. + * + * Instead of trying to placate userspace racing with itself, we + * ref the file if we see it and mark the fd slot as unused otherwise. + */ for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; + struct file *f = rcu_dereference_raw(*old_fds++); if (f) { get_file(f); } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. - */ __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); @@ -625,7 +633,7 @@ static struct file *pick_file(struct files_struct *files, unsigned fd) return NULL; fd = array_index_nospec(fd, fdt->max_fds); - file = fdt->fd[fd]; + file = rcu_dereference_raw(fdt->fd[fd]); if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); @@ -1095,7 +1103,7 @@ __releases(&files->file_lock) */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); - tofree = fdt->fd[fd]; + tofree = rcu_dereference_raw(fdt->fd[fd]); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); diff --git a/fs/filesystems.c b/fs/filesystems.c index 58b9067b2391..95e5256821a5 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -156,15 +156,19 @@ static int fs_index(const char __user * __name) static int fs_name(unsigned int index, char __user * buf) { struct file_system_type * tmp; - int len, res; + int len, res = -EINVAL; read_lock(&file_systems_lock); - for (tmp = file_systems; tmp; tmp = tmp->next, index--) - if (index <= 0 && try_module_get(tmp->owner)) + for (tmp = file_systems; tmp; tmp = tmp->next, index--) { + if (index == 0) { + if (try_module_get(tmp->owner)) + res = 0; break; + } + } read_unlock(&file_systems_lock); - if (!tmp) - return -EINVAL; + if (res) + return res; /* OK, we got the reference, so we can safely block */ len = strlen(tmp->name) + 1; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 12ef91d170bb..7faf1af59d5d 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -681,7 +681,6 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, 0, 0, fuse_wait_dax_page(inode)); } -/* dmap_end == 0 leads to unmapping of whole file */ int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end) { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 89bffaed421f..82951a535d2d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1121,6 +1121,8 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, else if (err == -EINTR) fuse_invalidate_attr(inode); + if (err == -ENOSYS) + err = -EPERM; return err; } @@ -1875,7 +1877,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (FUSE_IS_DAX(inode) && is_truncate) { filemap_invalidate_lock(mapping); fault_blocked = true; - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) { filemap_invalidate_unlock(mapping); return err; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ceb9f7d23038..3e4c3fcb588b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -241,7 +241,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (dax_truncate) { filemap_invalidate_lock(inode->i_mapping); - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) goto out_inode_unlock; } @@ -3023,7 +3023,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, inode_lock(inode); if (block_faults) { filemap_invalidate_lock(inode->i_mapping); - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) goto out; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index d84dacbdce2c..a6814bea0e0a 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1430,6 +1430,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc) unsigned int virtqueue_size; int err = -EIO; + if (!fsc->source) + return invalf(fsc, "No source specified"); + /* This gets a reference on virtio_fs object. This ptr gets installed * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() * to drop the reference to this object. diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 2c0908a30210..687670075d22 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -853,11 +853,12 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { - struct gfs2_holder *gh = NULL; + struct gfs2_holder *gh; if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) return; + /* While a demote is in progress, the GLF_LOCK flag must be set. */ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); if (test_bit(GLF_DEMOTE, &gl->gl_flags) && @@ -869,18 +870,22 @@ __acquires(&gl->gl_lockref.lock) set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); gl->gl_target = gl->gl_demote_state; + do_xmote(gl, NULL, gl->gl_target); + return; } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); if (do_promote(gl)) goto out_unlock; gh = find_first_waiter(gl); + if (!gh) + goto out_unlock; gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ + do_xmote(gl, gh, gl->gl_target); + return; } - do_xmote(gl, gh, gl->gl_target); - return; out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 29085643ad10..1cb5ce63fbf6 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -658,7 +658,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!IS_ERR(inode)) { if (S_ISDIR(inode->i_mode)) { iput(inode); - inode = ERR_PTR(-EISDIR); + inode = NULL; + error = -EISDIR; goto fail_gunlock; } d_instantiate(dentry, inode); diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 6add6ebfef89..cb823a8a6ba9 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) else key_len = tree->max_key_len + 1; + if (key_len > sizeof(hfs_btree_key) || key_len < 1) { + memset(key, 0, sizeof(hfs_btree_key)); + pr_err("hfs: Invalid key length: %d\n", key_len); + return; + } + hfs_bnode_read(node, key, off, key_len); } diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 87974d5e6791..079ea80534f7 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) else key_len = tree->max_key_len + 2; + if (key_len > sizeof(hfsplus_btree_key) || key_len < 1) { + memset(key, 0, sizeof(hfsplus_btree_key)); + pr_err("hfsplus: Invalid key length: %d\n", key_len); + return; + } + hfs_bnode_read(node, key, off, key_len); } diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 8b39c15c408c..15b2f094d36e 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -60,7 +60,7 @@ struct hostfs_stat { unsigned int uid; unsigned int gid; unsigned long long size; - struct hostfs_timespec atime, mtime, ctime; + struct hostfs_timespec atime, mtime, ctime, btime; unsigned int blksize; unsigned long long blocks; struct { diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index ff201753fd18..44fe76174e12 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -27,6 +27,7 @@ struct hostfs_inode_info { struct inode vfs_inode; struct mutex open_mutex; dev_t dev; + struct hostfs_timespec btime; }; static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) @@ -557,6 +558,7 @@ static int hostfs_inode_set(struct inode *ino, void *data) } HOSTFS_I(ino)->dev = dev; + HOSTFS_I(ino)->btime = st->btime; ino->i_ino = st->ino; ino->i_mode = st->mode; return hostfs_inode_update(ino, st); @@ -567,7 +569,10 @@ static int hostfs_inode_test(struct inode *inode, void *data) const struct hostfs_stat *st = data; dev_t dev = MKDEV(st->dev.maj, st->dev.min); - return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev; + return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev && + (inode->i_mode & S_IFMT) == (st->mode & S_IFMT) && + HOSTFS_I(inode)->btime.tv_sec == st->btime.tv_sec && + HOSTFS_I(inode)->btime.tv_nsec == st->btime.tv_nsec; } static struct inode *hostfs_iget(struct super_block *sb, char *name) diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 97e9c40a9448..3bcd9f35e70b 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -18,39 +18,48 @@ #include "hostfs.h" #include <utime.h> -static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) +static void statx_to_hostfs(const struct statx *buf, struct hostfs_stat *p) { - p->ino = buf->st_ino; - p->mode = buf->st_mode; - p->nlink = buf->st_nlink; - p->uid = buf->st_uid; - p->gid = buf->st_gid; - p->size = buf->st_size; - p->atime.tv_sec = buf->st_atime; - p->atime.tv_nsec = 0; - p->ctime.tv_sec = buf->st_ctime; - p->ctime.tv_nsec = 0; - p->mtime.tv_sec = buf->st_mtime; - p->mtime.tv_nsec = 0; - p->blksize = buf->st_blksize; - p->blocks = buf->st_blocks; - p->rdev.maj = os_major(buf->st_rdev); - p->rdev.min = os_minor(buf->st_rdev); - p->dev.maj = os_major(buf->st_dev); - p->dev.min = os_minor(buf->st_dev); + p->ino = buf->stx_ino; + p->mode = buf->stx_mode; + p->nlink = buf->stx_nlink; + p->uid = buf->stx_uid; + p->gid = buf->stx_gid; + p->size = buf->stx_size; + p->atime.tv_sec = buf->stx_atime.tv_sec; + p->atime.tv_nsec = buf->stx_atime.tv_nsec; + p->ctime.tv_sec = buf->stx_ctime.tv_sec; + p->ctime.tv_nsec = buf->stx_ctime.tv_nsec; + p->mtime.tv_sec = buf->stx_mtime.tv_sec; + p->mtime.tv_nsec = buf->stx_mtime.tv_nsec; + if (buf->stx_mask & STATX_BTIME) { + p->btime.tv_sec = buf->stx_btime.tv_sec; + p->btime.tv_nsec = buf->stx_btime.tv_nsec; + } else { + memset(&p->btime, 0, sizeof(p->btime)); + } + p->blksize = buf->stx_blksize; + p->blocks = buf->stx_blocks; + p->rdev.maj = buf->stx_rdev_major; + p->rdev.min = buf->stx_rdev_minor; + p->dev.maj = buf->stx_dev_major; + p->dev.min = buf->stx_dev_minor; } int stat_file(const char *path, struct hostfs_stat *p, int fd) { - struct stat64 buf; + struct statx buf; + int flags = AT_SYMLINK_NOFOLLOW; if (fd >= 0) { - if (fstat64(fd, &buf) < 0) - return -errno; - } else if (lstat64(path, &buf) < 0) { - return -errno; + flags |= AT_EMPTY_PATH; + path = ""; } - stat64_to_hostfs(&buf, p); + + if ((statx(fd, path, flags, STATX_BASIC_STATS | STATX_BTIME, &buf)) < 0) + return -errno; + + statx_to_hostfs(&buf, p); return 0; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index e7e6701806ad..7ffdf0d037fa 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -224,7 +224,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, } /* truncate len if we find any trailing uptodate block(s) */ - for ( ; i <= last; i++) { + while (++i <= last) { if (ifs_block_is_uptodate(ifs, i)) { plen -= (last - i + 1) * block_size; last = i - 1; diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index eb2f8273e6f1..09df40b612fb 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -147,7 +147,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *file, de = tmpde; } /* Basic sanity check, whether name doesn't exceed dir entry */ - if (de_len < de->name_len[0] + + if (de_len < sizeof(struct iso_directory_record) || + de_len < de->name_len[0] + sizeof(struct iso_directory_record)) { printk(KERN_NOTICE "iso9660: Corrupted directory entry" " in block %lu of inode %lu\n", block, diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 35768a63fb1d..421d247fae52 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -180,7 +180,7 @@ static struct dentry *isofs_fh_to_parent(struct super_block *sb, return NULL; return isofs_export_iget(sb, - fh_len > 2 ? ifid->parent_block : 0, + fh_len > 3 ? ifid->parent_block : 0, ifid->parent_offset, fh_len > 4 ? ifid->parent_generation : 0); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index dfbb8f73861f..ddde73299d62 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1914,7 +1914,6 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, /* Log is no longer empty */ write_lock(&journal->j_state_lock); - WARN_ON(!sb->s_sequence); journal->j_flags &= ~JBD2_FLUSHED; write_unlock(&journal->j_state_lock); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 421c0d360836..19ec32537483 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -286,21 +286,22 @@ static int fc_do_one_pass(journal_t *journal, int jbd2_journal_recover(journal_t *journal) { int err, err2; - journal_superblock_t * sb; - struct recovery_info info; errseq_t wb_err; struct address_space *mapping; memset(&info, 0, sizeof(info)); - sb = journal->j_superblock; /* * The journal superblock's s_start field (the current log head) * is always zero if, and only if, the journal was cleanly - * unmounted. + * unmounted. We use its in-memory version j_tail here because + * jbd2_journal_wipe() could have updated it without updating journal + * superblock. */ - if (!sb->s_start) { + if (!journal->j_tail) { + journal_superblock_t *sb = journal->j_superblock; + jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n", be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head)); journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index f9009e4f9ffd..0e1019382cf5 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -204,6 +204,10 @@ int dbMount(struct inode *ipbmap) bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight); bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); + if (!bmp->db_agwidth) { + err = -EINVAL; + goto err_release_metapage; + } bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG || @@ -3403,7 +3407,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) oldl2agsize = bmp->db_agl2size; bmp->db_agl2size = l2agsize; - bmp->db_agsize = 1 << l2agsize; + bmp->db_agsize = (s64)1 << l2agsize; /* compute new number of AG */ agno = bmp->db_numag; @@ -3666,8 +3670,8 @@ void dbFinalizeBmap(struct inode *ipbmap) * system size is not a multiple of the group size). */ inactfree = (inactags && ag_rem) ? - ((inactags - 1) << bmp->db_agl2size) + ag_rem - : inactags << bmp->db_agl2size; + (((s64)inactags - 1) << bmp->db_agl2size) + ag_rem + : ((s64)inactags << bmp->db_agl2size); /* determine how many free blocks are in the active * allocation groups plus the average number of free blocks diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 8f85177f284b..93db6eec4465 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -117,7 +117,8 @@ do { \ if (!(RC)) { \ if (((P)->header.nextindex > \ (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \ - ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \ + ((BN) && (((P)->header.maxslot > DTPAGEMAXSLOT) || \ + ((P)->header.stblindex >= DTPAGEMAXSLOT)))) { \ BT_PUTPAGE(MP); \ jfs_error((IP)->i_sb, \ "DT_GETPAGE: dtree page corrupt\n"); \ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index b30e4cf2f579..9a6d504228e7 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -102,7 +102,7 @@ int diMount(struct inode *ipimap) * allocate/initialize the in-memory inode map control structure */ /* allocate the in-memory inode map control structure. */ - imap = kmalloc(sizeof(struct inomap), GFP_KERNEL); + imap = kzalloc(sizeof(struct inomap), GFP_KERNEL); if (imap == NULL) return -ENOMEM; @@ -456,7 +456,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) dp += inum % 8; /* 8 inodes per 4K page */ /* copy on-disk inode to in-memory inode */ - if ((copy_from_dinode(dp, ip)) != 0) { + if ((copy_from_dinode(dp, ip) != 0) || (ip->i_nlink == 0)) { /* handle bad return by returning NULL for ip */ set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 7252941bf165..b3b08c5ae701 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -559,11 +559,16 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) size_check: if (EALIST_SIZE(ea_buf->xattr) != ea_size) { - int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr)); - - printk(KERN_ERR "ea_get: invalid extended attribute\n"); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, - ea_buf->xattr, size, 1); + if (unlikely(EALIST_SIZE(ea_buf->xattr) > INT_MAX)) { + printk(KERN_ERR "ea_get: extended attribute size too large: %u > INT_MAX\n", + EALIST_SIZE(ea_buf->xattr)); + } else { + int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr)); + + printk(KERN_ERR "ea_get: invalid extended attribute\n"); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, + ea_buf->xattr, size, 1); + } ea_release(inode, ea_buf); rc = -EIO; goto clean_up; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index b068ed32d7b3..f6e2a4523f7e 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1560,8 +1560,9 @@ void kernfs_break_active_protection(struct kernfs_node *kn) * invoked before finishing the kernfs operation. Note that while this * function restores the active reference, it doesn't and can't actually * restore the active protection - @kn may already or be in the process of - * being removed. Once kernfs_break_active_protection() is invoked, that - * protection is irreversibly gone for the kernfs operation instance. + * being drained and removed. Once kernfs_break_active_protection() is + * invoked, that protection is irreversibly gone for the kernfs operation + * instance. * * While this function may be called at any point after * kernfs_break_active_protection() is invoked, its most useful location diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 332d08d2fe0d..6b90fea6cca2 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -820,8 +820,9 @@ bool kernfs_should_drain_open_files(struct kernfs_node *kn) /* * @kn being deactivated guarantees that @kn->attr.open can't change * beneath us making the lockless test below safe. + * Callers post kernfs_unbreak_active_protection may be counted in + * kn->active by now, do not WARN_ON because of them. */ - WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); rcu_read_lock(); on = rcu_dereference(kn->attr.open); diff --git a/fs/namespace.c b/fs/namespace.c index b4385e2413d5..eab9185e2285 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -633,15 +633,11 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); - smp_mb(); // see mntput_no_expire() + smp_mb(); // see mntput_no_expire() and do_umount() if (likely(!read_seqretry(&mount_lock, seq))) return 0; - if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { - mnt_add_count(mnt, -1); - return 1; - } lock_mount_hash(); - if (unlikely(bastard->mnt_flags & MNT_DOOMED)) { + if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) { mnt_add_count(mnt, -1); unlock_mount_hash(); return 1; @@ -1786,6 +1782,7 @@ static int do_umount(struct mount *mnt, int flags) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { + smp_mb(); // paired with __legitimize_mnt() shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { @@ -1869,6 +1866,7 @@ static void warn_mandlock(void) static int can_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); + struct super_block *sb = path->dentry->d_sb; if (!may_mount()) return -EPERM; @@ -1878,7 +1876,7 @@ static int can_umount(const struct path *path, int flags) return -EINVAL; if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ return -EINVAL; - if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) + if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } @@ -2438,56 +2436,62 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath) struct vfsmount *mnt = path->mnt; struct dentry *dentry; struct mountpoint *mp = ERR_PTR(-ENOENT); + struct path under = {}; for (;;) { - struct mount *m; + struct mount *m = real_mount(mnt); if (beneath) { - m = real_mount(mnt); + path_put(&under); read_seqlock_excl(&mount_lock); - dentry = dget(m->mnt_mountpoint); + under.mnt = mntget(&m->mnt_parent->mnt); + under.dentry = dget(m->mnt_mountpoint); read_sequnlock_excl(&mount_lock); + dentry = under.dentry; } else { dentry = path->dentry; } inode_lock(dentry->d_inode); - if (unlikely(cant_mount(dentry))) { - inode_unlock(dentry->d_inode); - goto out; - } - namespace_lock(); - if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) { + if (unlikely(cant_mount(dentry) || !is_mounted(mnt))) + break; // not to be mounted on + + if (beneath && unlikely(m->mnt_mountpoint != dentry || + &m->mnt_parent->mnt != under.mnt)) { namespace_unlock(); inode_unlock(dentry->d_inode); - goto out; + continue; // got moved } mnt = lookup_mnt(path); - if (likely(!mnt)) + if (unlikely(mnt)) { + namespace_unlock(); + inode_unlock(dentry->d_inode); + path_put(path); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + continue; // got overmounted + } + mp = get_mountpoint(dentry); + if (IS_ERR(mp)) break; - - namespace_unlock(); - inode_unlock(dentry->d_inode); - if (beneath) - dput(dentry); - path_put(path); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - } - - mp = get_mountpoint(dentry); - if (IS_ERR(mp)) { - namespace_unlock(); - inode_unlock(dentry->d_inode); + if (beneath) { + /* + * @under duplicates the references that will stay + * at least until namespace_unlock(), so the path_put() + * below is safe (and OK to do under namespace_lock - + * we are not dropping the final references here). + */ + path_put(&under); + } + return mp; } - -out: + namespace_unlock(); + inode_unlock(dentry->d_inode); if (beneath) - dput(dentry); - + path_put(&under); return mp; } @@ -2498,14 +2502,11 @@ static inline struct mountpoint *lock_mount(struct path *path) static void unlock_mount(struct mountpoint *where) { - struct dentry *dentry = where->m_dentry; - + inode_unlock(where->m_dentry->d_inode); read_seqlock_excl(&mount_lock); put_mountpoint(where); read_sequnlock_excl(&mount_lock); - namespace_unlock(); - inode_unlock(dentry->d_inode); } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) @@ -2556,6 +2557,10 @@ static int do_change_type(struct path *path, int ms_flags) return -EINVAL; namespace_lock(); + if (!check_mnt(mnt)) { + err = -EINVAL; + goto out_unlock; + } if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) @@ -2994,7 +2999,7 @@ static int do_set_group(struct path *from_path, struct path *to_path) if (IS_MNT_SLAVE(from)) { struct mount *m = from->mnt_master; - list_add(&to->mnt_slave, &m->mnt_slave_list); + list_add(&to->mnt_slave, &from->mnt_slave); to->mnt_master = m; } @@ -3019,18 +3024,25 @@ out: * Check if path is overmounted, i.e., if there's a mount on top of * @path->mnt with @path->dentry as mountpoint. * - * Context: This function expects namespace_lock() to be held. + * Context: namespace_sem must be held at least shared. + * MUST NOT be called under lock_mount_hash() (there one should just + * call __lookup_mnt() and check if it returns NULL). * Return: If path is overmounted true is returned, false if not. */ static inline bool path_overmounted(const struct path *path) { + unsigned seq = read_seqbegin(&mount_lock); + bool no_child; + rcu_read_lock(); - if (unlikely(__lookup_mnt(path->mnt, path->dentry))) { - rcu_read_unlock(); - return true; - } + no_child = !__lookup_mnt(path->mnt, path->dentry); rcu_read_unlock(); - return false; + if (need_seqretry(&mount_lock, seq)) { + read_seqlock_excl(&mount_lock); + no_child = !__lookup_mnt(path->mnt, path->dentry); + read_sequnlock_excl(&mount_lock); + } + return unlikely(!no_child); } /** diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 7df2503cef6c..2d99f5e7a686 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -2,6 +2,7 @@ config NFS_FS tristate "NFS client support" depends on INET && FILE_LOCKING && MULTIUSER + select CRC32 select LOCKD select SUNRPC select NFS_ACL_SUPPORT if NFS_V3_ACL @@ -194,7 +195,6 @@ config NFS_USE_KERNEL_DNS config NFS_DEBUG bool depends on NFS_FS && SUNRPC_DEBUG - select CRC32 default y config NFS_DISABLE_UDP_SUPPORT diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 62607d52bfa5..aa09f930eeaf 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1080,6 +1080,8 @@ struct nfs_server *nfs_create_server(struct fs_context *fc) if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } + /* Linux 'subtree_check' borkenness mandates this setting */ + server->fh_expire_type = NFS_FH_VOL_RENAME; if (!(fattr->valid & NFS_ATTR_FATTR)) { error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 4bf2526a3a18..bbd582d8a7dc 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -297,7 +297,8 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi) if (delegation == NULL) goto out; spin_lock(&delegation->lock); - if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + if (delegation->inode && + !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); /* Refcount matched in nfs_end_delegation_return() */ ret = nfs_get_delegation(delegation); @@ -570,17 +571,6 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) ret = true; - else if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags)) { - struct inode *inode; - - spin_lock(&delegation->lock); - inode = delegation->inode; - if (inode && list_empty(&NFS_I(inode)->open_files)) - ret = true; - spin_unlock(&delegation->lock); - } - if (ret) - clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) || test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) || test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) @@ -821,11 +811,25 @@ int nfs4_inode_make_writeable(struct inode *inode) return nfs4_inode_return_delegation(inode); } -static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, - struct nfs_delegation *delegation) +static void +nfs_mark_return_if_closed_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) { - set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); + struct inode *inode; + + if (test_bit(NFS_DELEGATION_RETURN, &delegation->flags) || + test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags)) + return; + spin_lock(&delegation->lock); + inode = delegation->inode; + if (!inode) + goto out; + if (list_empty(&NFS_I(inode)->open_files)) + nfs_mark_return_delegation(server, delegation); + else + set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); +out: + spin_unlock(&delegation->lock); } static bool nfs_server_mark_return_all_delegations(struct nfs_server *server) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 39f7549afcf5..389186384235 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2642,6 +2642,18 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) unblock_revalidate(new_dentry); } +static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry, + struct dentry *new_dentry) +{ + struct nfs_server *server = NFS_SB(old_dentry->d_sb); + + if (old_dentry->d_parent != new_dentry->d_parent) + return false; + if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE) + return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN); + return true; +} + /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -2729,7 +2741,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } - if (S_ISREG(old_inode->i_mode)) + if (S_ISREG(old_inode->i_mode) && + nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry)) nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, must_unblock ? nfs_unblock_rename : NULL); diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index acf4b88889dc..d5f1fbfd9a0c 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -75,6 +75,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct page *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; + struct net *net = server->nfs_client->cl_net; /* set up xdr stream */ scratch = alloc_page(gfp_flags); @@ -158,8 +159,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, mp_count = be32_to_cpup(p); /* multipath count */ for (j = 0; j < mp_count; j++) { - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -169,7 +169,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_free_deviceid; } - dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + dsaddr->ds_list[i] = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!dsaddr->ds_list[i]) goto out_err_drain_dsaddrs; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 2b3c5eea1f13..0bc537de1b29 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1255,6 +1255,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -ECONNRESET: case -EHOSTDOWN: case -EHOSTUNREACH: + case -ENETDOWN: case -ENETUNREACH: case -EADDRINUSE: case -ENOBUFS: diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e028f5a0ef5f..d21c5ecfbf1c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -49,6 +49,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct nfs4_pnfs_ds_addr *da; struct nfs4_ff_layout_ds *new_ds = NULL; struct nfs4_ff_ds_version *ds_versions = NULL; + struct net *net = server->nfs_client->cl_net; u32 mp_count; u32 version_count; __be32 *p; @@ -80,8 +81,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, for (i = 0; i < mp_count; i++) { /* multipath ds */ - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -149,7 +149,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, new_ds->ds_versions = ds_versions; new_ds->ds_versions_cnt = version_count; - new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + new_ds->ds = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!new_ds->ds) goto out_err_drain_dsaddrs; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 56bbf59bda3c..06230baaa554 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -74,6 +74,8 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { + if (unlikely(nfs_current_task_exiting())) + return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a92b234ae087..c29ad2e1d416 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -859,18 +859,16 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts) return ((u64)ts->tv_sec << 30) + ts->tv_nsec; } -#ifdef CONFIG_CRC32 static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) { return ~crc32_le(0xFFFFFFFF, &stateid->other[0], NFS4_STATEID_OTHER_SIZE); } -#else -static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) + +static inline bool nfs_current_task_exiting(void) { - return 0; + return (current->flags & PF_EXITING) != 0; } -#endif static inline bool nfs_error_is_fatal(int err) { diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 4bf208a0a8e9..715753f41fb0 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -39,7 +39,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; - } while (!fatal_signal_pending(current)); + } while (!fatal_signal_pending(current) && !nfs_current_task_exiting()); return res; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4b12e45f5753..1b94a55215e7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -422,6 +422,8 @@ static int nfs4_delay_killable(long *timeout) { might_sleep(); + if (unlikely(nfs_current_task_exiting())) + return -EINTR; __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(nfs4_update_delay(timeout)); if (!__fatal_signal_pending(current)) @@ -433,6 +435,8 @@ static int nfs4_delay_interruptible(long *timeout) { might_sleep(); + if (unlikely(nfs_current_task_exiting())) + return -EINTR; __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(nfs4_update_delay(timeout)); if (!signal_pending(current)) @@ -1712,7 +1716,8 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, rcu_read_unlock(); trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0); - if (!fatal_signal_pending(current)) { + if (!fatal_signal_pending(current) && + !nfs_current_task_exiting()) { if (schedule_timeout(5*HZ) == 0) status = -EAGAIN; else @@ -3494,7 +3499,7 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, write_sequnlock(&state->seqlock); trace_nfs4_close_stateid_update_wait(state->inode, dst, 0); - if (fatal_signal_pending(current)) + if (fatal_signal_pending(current) || nfs_current_task_exiting()) status = -EINTR; else if (schedule_timeout(5*HZ) != 0) @@ -6880,10 +6885,18 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs4_unlockdata *p; struct nfs4_state *state = lsp->ls_state; struct inode *inode = state->inode; + struct nfs_lock_context *l_ctx; p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return NULL; + l_ctx = nfs_get_lock_context(ctx); + if (!IS_ERR(l_ctx)) { + p->l_ctx = l_ctx; + } else { + kfree(p); + return NULL; + } p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; p->arg.seqid = seqid; @@ -6891,7 +6904,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->lsp = lsp; /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); - p->l_ctx = nfs_get_lock_context(ctx); locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 351616c61df5..f9c291e2165c 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -148,16 +148,12 @@ static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst, memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN); } -#ifdef CONFIG_CRC32 /* * nfs_session_id_hash - calculate the crc32 hash for the session id * @session - pointer to session */ #define nfs_session_id_hash(sess_id) \ (~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data))) -#else -#define nfs_session_id_hash(session) (0) -#endif #else /* defined(CONFIG_NFS_V4_1) */ static inline int nfs4_init_session(struct nfs_client *clp) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 794bb4aa588d..9fc71dc090c2 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2741,7 +2741,15 @@ out_error: pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" " with error %d\n", section_sep, section, clp->cl_hostname, -status); - ssleep(1); + switch (status) { + case -ENETDOWN: + case -ENETUNREACH: + nfs_mark_client_ready(clp, -EIO); + break; + default: + ssleep(1); + break; + } out_drain: memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index fe83c681e3fe..73aa5a63afe3 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -732,6 +732,14 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return remaining; } +static void pnfs_reset_return_info(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); +} + static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, struct list_head *free_me, @@ -1180,6 +1188,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); pnfs_free_returned_lsegs(lo, &freeme, range, seq); pnfs_set_layout_stateid(lo, stateid, NULL, true); + pnfs_reset_return_info(lo); } else pnfs_mark_layout_stateid_invalid(lo, &freeme); out_unlock: diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index d886c8226d8f..79996d7dad0f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -59,6 +59,7 @@ struct nfs4_pnfs_ds { struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ char *ds_remotestr; /* comma sep list of addrs */ struct list_head ds_addrs; + const struct net *ds_net; struct nfs_client *ds_clp; refcount_t ds_count; unsigned long ds_state; @@ -405,7 +406,8 @@ int pnfs_generic_commit_pagelist(struct inode *inode, int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max); void pnfs_generic_write_commit_done(struct rpc_task *task, void *data); void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds); -struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, +struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net, + struct list_head *dsaddrs, gfp_t gfp_flags); void nfs4_pnfs_v3_ds_connect_unload(void); int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 88e061bd711b..1b317c44da12 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -651,12 +651,12 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1, * Lookup DS by addresses. nfs4_ds_cache_lock is held */ static struct nfs4_pnfs_ds * -_data_server_lookup_locked(const struct list_head *dsaddrs) +_data_server_lookup_locked(const struct net *net, const struct list_head *dsaddrs) { struct nfs4_pnfs_ds *ds; list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) - if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) + if (ds->ds_net == net && _same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) return ds; return NULL; } @@ -763,7 +763,7 @@ out_err: * uncached and return cached struct nfs4_pnfs_ds. */ struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) +nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_flags) { struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; char *remotestr; @@ -781,13 +781,14 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(dsaddrs); + tmp_ds = _data_server_lookup_locked(net, dsaddrs); if (tmp_ds == NULL) { INIT_LIST_HEAD(&ds->ds_addrs); list_splice_init(dsaddrs, &ds->ds_addrs); ds->ds_remotestr = remotestr; refcount_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); + ds->ds_net = net; ds->ds_clp = NULL; list_add(&ds->ds_node, &nfs4_data_server_cache); dprintk("%s add new data server %s\n", __func__, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e1bcad5906ae..4e72ee57fc8f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1021,6 +1021,16 @@ int nfs_reconfigure(struct fs_context *fc) sync_filesystem(sb); /* + * The SB_RDONLY flag has been removed from the superblock during + * mounts to prevent interference between different filesystems. + * Similarly, it is also necessary to ignore the SB_RDONLY flag + * during reconfiguration; otherwise, it may also result in the + * creation of redundant superblocks when mounting a directory with + * different rw and ro flags multiple times. + */ + fc->sb_flags_mask &= ~SB_RDONLY; + + /* * Userspace mount programs that send binary options generally send * them populated with default values. We have no way to know which * ones were explicitly specified. Fall back to legacy behavior and @@ -1277,8 +1287,17 @@ int nfs_get_tree_common(struct fs_context *fc) if (IS_ERR(server)) return PTR_ERR(server); + /* + * When NFS_MOUNT_UNSHARED is not set, NFS forces the sharing of a + * superblock among each filesystem that mounts sub-directories + * belonging to a single exported root path. + * To prevent interference between different filesystems, the + * SB_RDONLY flag should be removed from the superblock. + */ if (server->flags & NFS_MOUNT_UNSHARED) compare_super = NULL; + else + fc->sb_flags &= ~SB_RDONLY; /* -o noac implies -o sync */ if (server->flags & NFS_MOUNT_NOAC) diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 7b59a40d40c0..784f7c1d003b 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -14,6 +14,7 @@ #include <linux/rcupdate.h> #include <linux/lockd/lockd.h> +#include "internal.h" #include "nfs4_fs.h" #include "netns.h" #include "sysfs.h" @@ -228,6 +229,25 @@ static void shutdown_client(struct rpc_clnt *clnt) rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL); } +/* + * Shut down the nfs_client only once all the superblocks + * have been shut down. + */ +static void shutdown_nfs_client(struct nfs_client *clp) +{ + struct nfs_server *server; + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (!(server->flags & NFS_MOUNT_SHUTDOWN)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + nfs_mark_client_ready(clp, -EIO); + shutdown_client(clp->cl_rpcclient); +} + static ssize_t shutdown_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -259,7 +279,6 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, server->flags |= NFS_MOUNT_SHUTDOWN; shutdown_client(server->client); - shutdown_client(server->nfs_client->cl_rpcclient); if (!IS_ERR(server->client_acl)) shutdown_client(server->client_acl); @@ -267,6 +286,7 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, if (server->nlm_host) shutdown_client(server->nlm_host->h_rpcclnt); out: + shutdown_nfs_client(server->nfs_client); return count; } diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 43b88eaf0673..05c10f70456c 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -4,6 +4,7 @@ config NFSD depends on INET depends on FILE_LOCKING depends on FSNOTIFY + select CRC32 select LOCKD select SUNRPC select EXPORTFS diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a25cb2ff1b0b..e2875706e6bf 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1066,6 +1066,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) return openlockstateid(stid); } +/* + * As the sc_free callback of deleg, this may be called by nfs4_put_stid + * in nfsd_break_one_deleg. + * Considering nfsd_break_one_deleg is called with the flc->flc_lock held, + * this function mustn't ever sleep. + */ static void nfs4_free_deleg(struct nfs4_stid *stid) { struct nfs4_delegation *dp = delegstateid(stid); @@ -4920,6 +4926,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = { static void nfsd_break_one_deleg(struct nfs4_delegation *dp) { + bool queued; /* * We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease @@ -4928,7 +4935,10 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) * we know it's safe to take a reference. */ refcount_inc(&dp->dl_stid.sc_count); - WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall)); + queued = nfsd4_run_cb(&dp->dl_recall); + WARN_ON_ONCE(!queued); + if (!queued) + refcount_dec(&dp->dl_stid.sc_count); } /* Called from break_lease() with flc_lock held. */ @@ -6279,14 +6289,19 @@ deleg_reaper(struct nfsd_net *nn) spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); - if (clp->cl_state != NFSD4_ACTIVE || - list_empty(&clp->cl_delegations) || - atomic_read(&clp->cl_delegs_in_recall) || - test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) || - (ktime_get_boottime_seconds() - - clp->cl_ra_time < 5)) { + + if (clp->cl_state != NFSD4_ACTIVE) + continue; + if (list_empty(&clp->cl_delegations)) + continue; + if (atomic_read(&clp->cl_delegs_in_recall)) + continue; + if (test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags)) + continue; + if (ktime_get_boottime_seconds() - clp->cl_ra_time < 5) + continue; + if (clp->cl_cb_state != NFSD4_CB_UP) continue; - } list_add(&clp->cl_ra_cblist, &cblist); /* release in nfsd4_cb_recall_any_release */ diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 40426f899e76..f1420d3510d2 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -263,7 +263,6 @@ static inline bool fh_fsid_match(const struct knfsd_fh *fh1, return true; } -#ifdef CONFIG_CRC32 /** * knfsd_fh_hash - calculate the crc32 hash for the filehandle * @fh - pointer to filehandle @@ -275,12 +274,6 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) { return ~crc32_le(0xFFFFFFFF, fh->fh_raw, fh->fh_size); } -#else -static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) -{ - return 0; -} -#endif /** * fh_clear_pre_post_attrs - Reset pre/post attributes diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index dbd27a44632f..5e70a3478afe 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2094,11 +2094,13 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree, ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { - if (unlikely(ret == -ENOENT)) + if (unlikely(ret == -ENOENT)) { nilfs_crit(btree->b_inode->i_sb, "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d", btree->b_inode->i_ino, (unsigned long long)key, level); + ret = -EINVAL; + } goto out; } diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 893ab36824cc..2d8dc6b35b54 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -273,6 +273,9 @@ static int nilfs_direct_propagate(struct nilfs_bmap *bmap, dat = nilfs_bmap_get_dat(bmap); key = nilfs_bmap_data_get_key(bmap, bh); ptr = nilfs_direct_get_ptr(bmap, key); + if (ptr == NILFS_BMAP_INVALID_PTR) + return -EINVAL; + if (!buffer_nilfs_volatile(bh)) { oldreq.pr_entry_nr = ptr; newreq.pr_entry_nr = ptr; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 2ecd0303f942..4aea45821611 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -335,6 +335,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count, } if (extend_init && !is_compressed(ni)) { + WARN_ON(ni->i_valid >= pos); err = ntfs_extend_initialized_size(file, ni, ni->i_valid, pos); if (err) goto out; diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 9089c58a005c..191b91ffadbb 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -618,7 +618,7 @@ static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes) u32 off = le32_to_cpu(hdr->de_off); if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot || - off + sizeof(struct NTFS_DE) > end) { + size_add(off, sizeof(struct NTFS_DE)) > end) { /* incorrect index buffer. */ return false; } @@ -736,7 +736,7 @@ fill_table: if (end > total) return NULL; - if (off + sizeof(struct NTFS_DE) > end) + if (size_add(off, sizeof(struct NTFS_DE)) > end) return NULL; e = Add2Ptr(hdr, off); @@ -2184,6 +2184,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx, e = hdr_first_de(&n->index->ihdr); fnd_push(fnd, n, e); + if (!e) { + err = -EINVAL; + goto out; + } if (!de_is_last(e)) { /* @@ -2205,6 +2209,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx, n = fnd->nodes[level]; te = hdr_first_de(&n->index->ihdr); + if (!te) { + err = -EINVAL; + goto out; + } /* Copy the candidate entry into the replacement entry buffer. */ re = kmalloc(le16_to_cpu(te->size) + sizeof(u64), GFP_NOFS); if (!re) { diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 964e27c7b901..c1d1c4a7cf4d 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -717,7 +717,7 @@ static inline struct NTFS_DE *hdr_first_de(const struct INDEX_HDR *hdr) struct NTFS_DE *e; u16 esize; - if (de_off >= used || de_off + sizeof(struct NTFS_DE) > used ) + if (de_off >= used || size_add(de_off, sizeof(struct NTFS_DE)) > used) return NULL; e = Add2Ptr(hdr, de_off); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f0937902f7b4..e6191249169e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1796,6 +1796,14 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, el = root_el; while (el->l_tree_depth) { + if (unlikely(le16_to_cpu(el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH)) { + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has invalid tree depth %u in extent list\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + le16_to_cpu(el->l_tree_depth)); + ret = -EROFS; + goto out; + } if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), "Owner %llu has empty extent list at depth %u\n", diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index cbe3c12ff5f7..b246e5327111 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -174,7 +174,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb) struct ocfs2_recovery_map *rm; mutex_init(&osb->recovery_lock); - osb->disable_recovery = 0; + osb->recovery_state = OCFS2_REC_ENABLED; osb->recovery_thread_task = NULL; init_waitqueue_head(&osb->recovery_event); @@ -190,31 +190,53 @@ int ocfs2_recovery_init(struct ocfs2_super *osb) return 0; } -/* we can't grab the goofy sem lock from inside wait_event, so we use - * memory barriers to make sure that we'll see the null task before - * being woken up */ static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) { - mb(); return osb->recovery_thread_task != NULL; } -void ocfs2_recovery_exit(struct ocfs2_super *osb) +static void ocfs2_recovery_disable(struct ocfs2_super *osb, + enum ocfs2_recovery_state state) { - struct ocfs2_recovery_map *rm; - - /* disable any new recovery threads and wait for any currently - * running ones to exit. Do this before setting the vol_state. */ mutex_lock(&osb->recovery_lock); - osb->disable_recovery = 1; + /* + * If recovery thread is not running, we can directly transition to + * final state. + */ + if (!ocfs2_recovery_thread_running(osb)) { + osb->recovery_state = state + 1; + goto out_lock; + } + osb->recovery_state = state; + /* Wait for recovery thread to acknowledge state transition */ + wait_event_cmd(osb->recovery_event, + !ocfs2_recovery_thread_running(osb) || + osb->recovery_state >= state + 1, + mutex_unlock(&osb->recovery_lock), + mutex_lock(&osb->recovery_lock)); +out_lock: mutex_unlock(&osb->recovery_lock); - wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); - /* At this point, we know that no more recovery threads can be - * launched, so wait for any recovery completion work to - * complete. */ + /* + * At this point we know that no more recovery work can be queued so + * wait for any recovery completion work to complete. + */ if (osb->ocfs2_wq) flush_workqueue(osb->ocfs2_wq); +} + +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb) +{ + ocfs2_recovery_disable(osb, OCFS2_REC_QUOTA_WANT_DISABLE); +} + +void ocfs2_recovery_exit(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + /* disable any new recovery threads and wait for any currently + * running ones to exit. Do this before setting the vol_state. */ + ocfs2_recovery_disable(osb, OCFS2_REC_WANT_DISABLE); /* * Now that recovery is shut down, and the osb is about to be @@ -1472,6 +1494,18 @@ static int __ocfs2_recovery_thread(void *arg) } } restart: + if (quota_enabled) { + mutex_lock(&osb->recovery_lock); + /* Confirm that recovery thread will no longer recover quotas */ + if (osb->recovery_state == OCFS2_REC_QUOTA_WANT_DISABLE) { + osb->recovery_state = OCFS2_REC_QUOTA_DISABLED; + wake_up(&osb->recovery_event); + } + if (osb->recovery_state >= OCFS2_REC_QUOTA_DISABLED) + quota_enabled = 0; + mutex_unlock(&osb->recovery_lock); + } + status = ocfs2_super_lock(osb, 1); if (status < 0) { mlog_errno(status); @@ -1569,27 +1603,29 @@ bail: ocfs2_free_replay_slots(osb); osb->recovery_thread_task = NULL; - mb(); /* sync with ocfs2_recovery_thread_running */ + if (osb->recovery_state == OCFS2_REC_WANT_DISABLE) + osb->recovery_state = OCFS2_REC_DISABLED; wake_up(&osb->recovery_event); mutex_unlock(&osb->recovery_lock); - if (quota_enabled) - kfree(rm_quota); + kfree(rm_quota); return status; } void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) { + int was_set = -1; + mutex_lock(&osb->recovery_lock); + if (osb->recovery_state < OCFS2_REC_WANT_DISABLE) + was_set = ocfs2_recovery_map_set(osb, node_num); trace_ocfs2_recovery_thread(node_num, osb->node_num, - osb->disable_recovery, osb->recovery_thread_task, - osb->disable_recovery ? - -1 : ocfs2_recovery_map_set(osb, node_num)); + osb->recovery_state, osb->recovery_thread_task, was_set); - if (osb->disable_recovery) + if (osb->recovery_state >= OCFS2_REC_WANT_DISABLE) goto out; if (osb->recovery_thread_task) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index e3c3a35dc5e0..6397170f302f 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -148,6 +148,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb); int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb); int ocfs2_compute_replay_slots(struct ocfs2_super *osb); void ocfs2_free_replay_slots(struct ocfs2_super *osb); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 8fe826143d7b..5e3ebbab698a 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -308,6 +308,21 @@ enum ocfs2_journal_trigger_type { void ocfs2_initialize_journal_triggers(struct super_block *sb, struct ocfs2_triggers triggers[]); +enum ocfs2_recovery_state { + OCFS2_REC_ENABLED = 0, + OCFS2_REC_QUOTA_WANT_DISABLE, + /* + * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for + * ocfs2_recovery_disable_quota() to work. + */ + OCFS2_REC_QUOTA_DISABLED, + OCFS2_REC_WANT_DISABLE, + /* + * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work + */ + OCFS2_REC_DISABLED, +}; + struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; @@ -370,7 +385,7 @@ struct ocfs2_super struct ocfs2_recovery_map *recovery_map; struct ocfs2_replay_map *replay_map; struct task_struct *recovery_thread_task; - int disable_recovery; + enum ocfs2_recovery_state recovery_state; wait_queue_head_t checkpoint_event; struct ocfs2_journal *journal; unsigned long osb_commit_interval; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 4b4fa58cd32f..c7bda48b5fb2 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -453,8 +453,7 @@ out: /* Sync changes in local quota file into global quota file and * reinitialize local quota file. - * The function expects local quota file to be already locked and - * s_umount locked in shared mode. */ + * The function expects local quota file to be already locked. */ static int ocfs2_recover_local_quota_file(struct inode *lqinode, int type, struct ocfs2_quota_recovery *rec) @@ -585,7 +584,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, { unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE }; - struct super_block *sb = osb->sb; struct ocfs2_local_disk_dqinfo *ldinfo; struct buffer_head *bh; handle_t *handle; @@ -597,7 +595,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " "slot %u\n", osb->dev_str, slot_num); - down_read(&sb->s_umount); for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) continue; @@ -674,8 +671,7 @@ out_put: break; } out: - up_read(&sb->s_umount); - kfree(rec); + ocfs2_free_quota_recovery(rec); return status; } @@ -840,8 +836,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); /* - * s_umount held in exclusive mode protects us against racing with - * recovery thread... + * ocfs2_dismount_volume() has already aborted quota recovery... */ if (oinfo->dqi_rec) { ocfs2_free_quota_recovery(oinfo->dqi_rec); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 84fa585c6513..e585e77cdc88 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1870,6 +1870,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); + /* Stop quota recovery so that we can disable quotas */ + ocfs2_recovery_disable_quota(osb); + ocfs2_disable_quotas(osb); /* All dquots should be freed by now */ diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 085912268442..dd4dc70e4aaa 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -23,9 +23,9 @@ static int orangefs_writepage_locked(struct page *page, struct orangefs_write_range *wr = NULL; struct iov_iter iter; struct bio_vec bv; - size_t len, wlen; + size_t wlen; ssize_t ret; - loff_t off; + loff_t len, off; set_page_writeback(page); @@ -92,8 +92,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, struct orangefs_write_range *wrp, wr; struct iov_iter iter; ssize_t ret; - size_t len; - loff_t off; + loff_t len, off; int i; len = i_size_read(inode); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 981967e507b3..23b5c5f9c782 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -506,8 +506,6 @@ int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding); int ovl_ensure_verity_loaded(struct path *path); -int ovl_get_verity_xattr(struct ovl_fs *ofs, const struct path *path, - u8 *digest_buf, int *buf_length); int ovl_validate_verity(struct ovl_fs *ofs, struct path *metapath, struct path *datapath); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 2c056d737c27..93ee57bc82ad 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1180,6 +1180,11 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, return ERR_PTR(-EINVAL); } + if (ctx->nr == ctx->nr_data) { + pr_err("at least one non-data lowerdir is required\n"); + return ERR_PTR(-EINVAL); + } + err = -EINVAL; for (i = 0; i < ctx->nr; i++) { l = &ctx->lower[i]; diff --git a/fs/proc/base.c b/fs/proc/base.c index 91fe20b7657c..d444155581ca 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -416,7 +416,7 @@ static const struct file_operations proc_pid_cmdline_ops = { #ifdef CONFIG_KALLSYMS /* * Provides a wchan file via kallsyms in a proper one-value-per-file format. - * Returns the resolved symbol. If that fails, simply return the address. + * Returns the resolved symbol to user space. */ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 775ce0bcf08c..c8785d68e870 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -557,10 +557,16 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, return p; } -static inline void pde_set_flags(struct proc_dir_entry *pde) +static void pde_set_flags(struct proc_dir_entry *pde) { if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) pde->flags |= PROC_ENTRY_PERMANENT; + if (pde->proc_ops->proc_read_iter) + pde->flags |= PROC_ENTRY_proc_read_iter; +#ifdef CONFIG_COMPAT + if (pde->proc_ops->proc_compat_ioctl) + pde->flags |= PROC_ENTRY_proc_compat_ioctl; +#endif } struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, @@ -624,6 +630,7 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, p->proc_ops = &proc_seq_ops; p->seq_ops = ops; p->state_size = state_size; + pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_seq_private); @@ -654,6 +661,7 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, return NULL; p->proc_ops = &proc_single_ops; p->single_show = show; + pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_single_data); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 532dc9d240f7..897c71077a0f 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -679,13 +679,13 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) if (S_ISREG(inode->i_mode)) { inode->i_op = de->proc_iops; - if (de->proc_ops->proc_read_iter) + if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops; else inode->i_fop = &proc_reg_file_ops; #ifdef CONFIG_COMPAT - if (de->proc_ops->proc_compat_ioctl) { - if (de->proc_ops->proc_read_iter) + if (pde_has_proc_compat_ioctl(de)) { + if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops_compat; else inode->i_fop = &proc_reg_file_ops_compat; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9a8f32f21ff5..445c74a39a93 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -84,6 +84,20 @@ static inline void pde_make_permanent(struct proc_dir_entry *pde) pde->flags |= PROC_ENTRY_PERMANENT; } +static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_proc_read_iter; +} + +static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde) +{ +#ifdef CONFIG_COMPAT + return pde->flags & PROC_ENTRY_proc_compat_ioctl; +#else + return false; +#endif +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 7dbbf3b6d98d..a1c97cd2720a 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -264,7 +264,7 @@ static void parse_options(char *options) static int pstore_show_options(struct seq_file *m, struct dentry *root) { if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES) - seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes); + seq_printf(m, ",kmsg_bytes=%u", kmsg_bytes); return 0; } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 801d6c0b170c..a0fc51196910 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -6,7 +6,7 @@ #include <linux/time.h> #include <linux/pstore.h> -extern unsigned long kmsg_bytes; +extern unsigned int kmsg_bytes; #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); @@ -35,7 +35,7 @@ static inline void pstore_unregister_pmsg(void) {} extern struct pstore_info *psinfo; -extern void pstore_set_kmsg_bytes(int); +extern void pstore_set_kmsg_bytes(unsigned int bytes); extern void pstore_get_records(int); extern void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 03425928d2fb..ef62389212b6 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -92,8 +92,8 @@ module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); /* How much of the kernel log to snapshot */ -unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; -module_param(kmsg_bytes, ulong, 0444); +unsigned int kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; +module_param(kmsg_bytes, uint, 0444); MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); static void *compress_workspace; @@ -107,9 +107,9 @@ static void *compress_workspace; static char *big_oops_buf; static size_t max_compressed_size; -void pstore_set_kmsg_bytes(int bytes) +void pstore_set_kmsg_bytes(unsigned int bytes) { - kmsg_bytes = bytes; + WRITE_ONCE(kmsg_bytes, bytes); } /* Tag each group of saved records with a sequence number */ @@ -278,6 +278,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason) { struct kmsg_dump_iter iter; + unsigned int remaining = READ_ONCE(kmsg_bytes); unsigned long total = 0; const char *why; unsigned int part = 1; @@ -300,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, kmsg_dump_rewind(&iter); oopscount++; - while (total < kmsg_bytes) { + while (total < remaining) { char *dst; size_t dst_size; int header_size; diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 9c0ef4195b58..749794667295 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -29,7 +29,6 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, { struct cached_fid *cfid; - spin_lock(&cfids->cfid_list_lock); list_for_each_entry(cfid, &cfids->entries, entry) { if (!strcmp(cfid->path, path)) { /* @@ -38,25 +37,20 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, * being deleted due to a lease break. */ if (!cfid->time || !cfid->has_lease) { - spin_unlock(&cfids->cfid_list_lock); return NULL; } kref_get(&cfid->refcount); - spin_unlock(&cfids->cfid_list_lock); return cfid; } } if (lookup_only) { - spin_unlock(&cfids->cfid_list_lock); return NULL; } if (cfids->num_entries >= max_cached_dirs) { - spin_unlock(&cfids->cfid_list_lock); return NULL; } cfid = init_cached_dir(path); if (cfid == NULL) { - spin_unlock(&cfids->cfid_list_lock); return NULL; } cfid->cfids = cfids; @@ -74,7 +68,6 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, */ cfid->has_lease = true; - spin_unlock(&cfids->cfid_list_lock); return cfid; } @@ -185,8 +178,10 @@ replay_again: if (!utf16_path) return -ENOMEM; + spin_lock(&cfids->cfid_list_lock); cfid = find_or_create_cached_dir(cfids, path, lookup_only, tcon->max_cached_dirs); if (cfid == NULL) { + spin_unlock(&cfids->cfid_list_lock); kfree(utf16_path); return -ENOENT; } @@ -195,7 +190,6 @@ replay_again: * Otherwise, it is either a new entry or laundromat worker removed it * from @cfids->entries. Caller will put last reference if the latter. */ - spin_lock(&cfids->cfid_list_lock); if (cfid->has_lease && cfid->time) { spin_unlock(&cfids->cfid_list_lock); *ret_cfid = cfid; diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index 1fc1683b15bd..bf32bc22ebd6 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -778,7 +778,8 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, } /* validate that we do not go past end of acl */ - if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { + if (end_of_acl < (char *)pdacl + sizeof(struct smb_acl) || + end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { cifs_dbg(VFS, "ACL too small to parse DACL\n"); return; } @@ -799,15 +800,34 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, if (num_aces > 0) { umode_t denied_mode = 0; - if (num_aces > ULONG_MAX / sizeof(struct smb_ace *)) + if (num_aces > (le16_to_cpu(pdacl->size) - sizeof(struct smb_acl)) / + (offsetof(struct smb_ace, sid) + + offsetof(struct smb_sid, sub_auth) + sizeof(__le16))) return; + ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL); if (!ppace) return; for (i = 0; i < num_aces; ++i) { + if (end_of_acl - acl_base < acl_size) + break; + ppace[i] = (struct smb_ace *) (acl_base + acl_size); + acl_base = (char *)ppace[i]; + acl_size = offsetof(struct smb_ace, sid) + + offsetof(struct smb_sid, sub_auth); + + if (end_of_acl - acl_base < acl_size || + ppace[i]->sid.num_subauth == 0 || + ppace[i]->sid.num_subauth > SID_MAX_SUB_AUTHORITIES || + (end_of_acl - acl_base < + acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth) || + (le16_to_cpu(ppace[i]->size) < + acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth)) + break; + #ifdef CONFIG_CIFS_DEBUG2 dump_ace(ppace[i], end_of_acl); #endif @@ -851,7 +871,6 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, (void *)ppace[i], sizeof(struct smb_ace)); */ - acl_base = (char *)ppace[i]; acl_size = le16_to_cpu(ppace[i]->size); } diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index c46d418c1c0c..ca33f6cd6a80 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -1226,10 +1226,9 @@ typedef struct smb_com_query_information_rsp { typedef struct smb_com_setattr_req { struct smb_hdr hdr; /* wct = 8 */ __le16 attr; - __le16 time_low; - __le16 time_high; + __le32 last_write_time; __le16 reserved[5]; /* must be zero */ - __u16 ByteCount; + __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ unsigned char fileName[]; } __attribute__((packed)) SETATTR_REQ; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 85b0a30493a6..c6d325666b5c 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -31,6 +31,9 @@ extern void cifs_small_buf_release(void *); extern void free_rsp_buf(int, void *); extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, unsigned int /* length */); +extern int smb_send_kvec(struct TCP_Server_Info *server, + struct msghdr *msg, + size_t *sent); extern unsigned int _get_xid(void); extern void _free_xid(unsigned int); #define get_xid() \ @@ -158,6 +161,8 @@ extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, struct cifsFileInfo **ret_file); +extern int cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode, + struct file *file); extern unsigned int smbCalcSize(void *buf); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); @@ -393,6 +398,10 @@ extern int CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon); extern int CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon, struct kstatfs *FSData); +extern int SMBSetInformation(const unsigned int xid, struct cifs_tcon *tcon, + const char *fileName, __le32 attributes, __le64 write_time, + const struct nls_table *nls_codepage, + struct cifs_sb_info *cifs_sb); extern int CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const FILE_BASIC_INFO *data, const struct nls_table *nls_codepage, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 769950adb776..c36ab20050c1 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -2738,10 +2738,10 @@ int cifs_query_reparse_point(const unsigned int xid, io_req->TotalParameterCount = 0; io_req->TotalDataCount = 0; - io_req->MaxParameterCount = cpu_to_le32(2); + io_req->MaxParameterCount = cpu_to_le32(0); /* BB find exact data count max from sess structure BB */ io_req->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); - io_req->MaxSetupCount = 4; + io_req->MaxSetupCount = 1; io_req->Reserved = 0; io_req->ParameterOffset = 0; io_req->DataCount = 0; @@ -2768,6 +2768,22 @@ int cifs_query_reparse_point(const unsigned int xid, goto error; } + /* SetupCount must be 1, otherwise offset to ByteCount is incorrect. */ + if (io_rsp->SetupCount != 1) { + rc = -EIO; + goto error; + } + + /* + * ReturnedDataLen is output length of executed IOCTL. + * DataCount is output length transferred over network. + * Check that we have full FSCTL_GET_REPARSE_POINT buffer. + */ + if (data_count != le16_to_cpu(io_rsp->ReturnedDataLen)) { + rc = -EIO; + goto error; + } + end = 2 + get_bcc(&io_rsp->hdr) + (__u8 *)&io_rsp->ByteCount; start = (__u8 *)&io_rsp->hdr.Protocol + data_offset; if (start >= end) { @@ -5157,6 +5173,63 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +int +SMBSetInformation(const unsigned int xid, struct cifs_tcon *tcon, + const char *fileName, __le32 attributes, __le64 write_time, + const struct nls_table *nls_codepage, + struct cifs_sb_info *cifs_sb) +{ + SETATTR_REQ *pSMB; + SETATTR_RSP *pSMBr; + struct timespec64 ts; + int bytes_returned; + int name_len; + int rc; + + cifs_dbg(FYI, "In %s path %s\n", __func__, fileName); + +retry: + rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->fileName, + fileName, PATH_MAX, nls_codepage, + cifs_remap(cifs_sb)); + name_len++; /* trailing null */ + name_len *= 2; + } else { + name_len = copy_path_name(pSMB->fileName, fileName); + } + /* Only few attributes can be set by this command, others are not accepted by Win9x. */ + pSMB->attr = cpu_to_le16(le32_to_cpu(attributes) & + (ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | ATTR_ARCHIVE)); + /* Zero write time value (in both NT and SETATTR formats) means to not change it. */ + if (le64_to_cpu(write_time) != 0) { + ts = cifs_NTtimeToUnix(write_time); + pSMB->last_write_time = cpu_to_le32(ts.tv_sec); + } + pSMB->BufferFormat = 0x04; + name_len++; /* account for buffer type byte */ + inc_rfc1001_len(pSMB, (__u16)name_len); + pSMB->ByteCount = cpu_to_le16(name_len); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "Send error in %s = %d\n", __func__, rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto retry; + + return rc; +} + /* Some legacy servers such as NT4 require that the file times be set on an open handle, rather than by pathname - this is awkward due to potential access conflicts on the open, but it is unavoidable for these diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 198681d14153..3faaee33ad45 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1003,13 +1003,9 @@ clean_demultiplex_info(struct TCP_Server_Info *server) msleep(125); if (cifs_rdma_enabled(server)) smbd_destroy(server); - if (server->ssocket) { sock_release(server->ssocket); server->ssocket = NULL; - - /* Release netns reference for the socket. */ - put_net(cifs_net_ns(server)); } if (!list_empty(&server->pending_mid_q)) { @@ -1058,7 +1054,6 @@ clean_demultiplex_info(struct TCP_Server_Info *server) */ } - /* Release netns reference for this server. */ put_net(cifs_net_ns(server)); kfree(server->leaf_fullpath); kfree(server->hostname); @@ -1730,8 +1725,6 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, tcp_ses->ops = ctx->ops; tcp_ses->vals = ctx->vals; - - /* Grab netns reference for this server. */ cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); @@ -1863,7 +1856,6 @@ smbd_connected: out_err_crypto_release: cifs_crypto_secmech_release(tcp_ses); - /* Release netns reference for this server. */ put_net(cifs_net_ns(tcp_ses)); out_err: @@ -1872,10 +1864,8 @@ out_err: cifs_put_tcp_session(tcp_ses->primary_server, false); kfree(tcp_ses->hostname); kfree(tcp_ses->leaf_fullpath); - if (tcp_ses->ssocket) { + if (tcp_ses->ssocket) sock_release(tcp_ses->ssocket); - put_net(cifs_net_ns(tcp_ses)); - } kfree(tcp_ses); } return ERR_PTR(rc); @@ -2487,6 +2477,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) return 0; if (tcon->nodelete != ctx->nodelete) return 0; + if (tcon->posix_extensions != ctx->linux_ext) + return 0; return 1; } @@ -3059,8 +3051,10 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * sessinit is sent but no second negprot */ struct rfc1002_session_packet req = {}; - struct smb_hdr *smb_buf = (struct smb_hdr *)&req; + struct msghdr msg = {}; + struct kvec iov = {}; unsigned int len; + size_t sent; req.trailer.session_req.called_len = sizeof(req.trailer.session_req.called_name); @@ -3089,10 +3083,18 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * As per rfc1002, @len must be the number of bytes that follows the * length field of a rfc1002 session request payload. */ - len = sizeof(req) - offsetof(struct rfc1002_session_packet, trailer.session_req); + len = sizeof(req.trailer.session_req); + req.type = RFC1002_SESSION_REQUEST; + req.flags = 0; + req.length = cpu_to_be16(len); + len += offsetof(typeof(req), trailer.session_req); + iov.iov_base = &req; + iov.iov_len = len; + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, len); + rc = smb_send_kvec(server, &msg, &sent); + if (rc < 0 || len != sent) + return (rc == -EINTR || rc == -EAGAIN) ? rc : -ECONNABORTED; - smb_buf->smb_buf_length = cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | len); - rc = smb_send(server, smb_buf, len); /* * RFC1001 layer in at least one server requires very short break before * negprot presumably because not expecting negprot to follow so fast. @@ -3101,7 +3103,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) */ usleep_range(1000, 2000); - return rc; + return 0; } static int @@ -3137,20 +3139,20 @@ generic_ip_connect(struct TCP_Server_Info *server) socket = server->ssocket; } else { struct net *net = cifs_net_ns(server); + struct sock *sk; - rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket); + rc = __sock_create(net, sfamily, SOCK_STREAM, + IPPROTO_TCP, &server->ssocket, 1); if (rc < 0) { cifs_server_dbg(VFS, "Error %d creating socket\n", rc); return rc; } - /* - * Grab netns reference for the socket. - * - * It'll be released here, on error, or in clean_demultiplex_info() upon server - * teardown. - */ - get_net(net); + sk = server->ssocket->sk; + __netns_tracker_free(net, &sk->ns_tracker, false); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); /* BB other socket options to set KEEPALIVE, NODELAY? */ cifs_dbg(FYI, "Socket created\n"); @@ -3164,10 +3166,8 @@ generic_ip_connect(struct TCP_Server_Info *server) } rc = bind_socket(server); - if (rc < 0) { - put_net(cifs_net_ns(server)); + if (rc < 0) return rc; - } /* * Eventually check for other socket options to change from @@ -3204,7 +3204,6 @@ generic_ip_connect(struct TCP_Server_Info *server) if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc); - put_net(cifs_net_ns(server)); sock_release(socket); server->ssocket = NULL; return rc; @@ -3213,9 +3212,6 @@ generic_ip_connect(struct TCP_Server_Info *server) if (sport == htons(RFC1001_PORT)) rc = ip_rfc1001_connect(server); - if (rc < 0) - put_net(cifs_net_ns(server)); - return rc; } @@ -3960,11 +3956,13 @@ int cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, struct TCP_Server_Info *server) { + bool in_retry = false; int rc = 0; if (!server->ops->need_neg || !server->ops->negotiate) return -ENOSYS; +retry: /* only send once per connect */ spin_lock(&server->srv_lock); if (server->tcpStatus != CifsGood && @@ -3984,6 +3982,14 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, spin_unlock(&server->srv_lock); rc = server->ops->negotiate(xid, ses, server); + if (rc == -EAGAIN) { + /* Allow one retry attempt */ + if (!in_retry) { + in_retry = true; + goto retry; + } + rc = -EHOSTDOWN; + } if (rc == 0) { spin_lock(&server->srv_lock); if (server->tcpStatus == CifsInNegotiate) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index cb75b95efb70..d883ed75022c 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -816,6 +816,11 @@ int cifs_open(struct inode *inode, struct file *file) } else { _cifsFileInfo_put(cfile, true, false); } + } else { + /* hard link on the defeered close file */ + rc = cifs_get_hardlink_path(tcon, inode, file); + if (rc) + cifs_close_deferred_file(CIFS_I(inode)); } if (server->oplocks) @@ -1878,6 +1883,29 @@ cifs_move_llist(struct list_head *source, struct list_head *dest) list_move(li, dest); } +int +cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode, + struct file *file) +{ + struct cifsFileInfo *open_file = NULL; + struct cifsInodeInfo *cinode = CIFS_I(inode); + int rc = 0; + + spin_lock(&tcon->open_file_lock); + spin_lock(&cinode->open_file_lock); + + list_for_each_entry(open_file, &cinode->openFileList, flist) { + if (file->f_flags == open_file->f_flags) { + rc = -EINVAL; + break; + } + } + + spin_unlock(&cinode->open_file_lock); + spin_unlock(&tcon->open_file_lock); + return rc; +} + void cifs_free_llist(struct list_head *llist) { diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index b90cc918de7a..137d03781d52 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1249,6 +1249,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_rsize: ctx->rsize = result.uint_32; ctx->got_rsize = true; + ctx->vol_rsize = ctx->rsize; break; case Opt_wsize: ctx->wsize = result.uint_32; @@ -1264,6 +1265,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->wsize, PAGE_SIZE); } } + ctx->vol_wsize = ctx->wsize; break; case Opt_acregmax: if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) { @@ -1299,6 +1301,11 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->closetimeo = HZ * result.uint_32; break; case Opt_echo_interval: + if (result.uint_32 < SMB_ECHO_INTERVAL_MIN || + result.uint_32 > SMB_ECHO_INTERVAL_MAX) { + cifs_errorf(fc, "echo interval is out of bounds\n"); + goto cifs_parse_mount_err; + } ctx->echo_interval = result.uint_32; break; case Opt_snapshot: diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index bbd2063ab838..d0a2043ea446 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -253,6 +253,9 @@ struct smb3_fs_context { bool use_client_guid:1; /* reuse existing guid for multichannel */ u8 client_guid[SMB2_CLIENT_GUID_SIZE]; + /* User-specified original r/wsize value */ + unsigned int vol_rsize; + unsigned int vol_wsize; unsigned int bsize; unsigned int rasize; unsigned int rsize; diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index b9cf05e0940d..d93ebd58ecae 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1145,6 +1145,16 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, cifs_create_junction_fattr(fattr, sb); goto out; } + /* + * If the reparse point is unsupported by the Linux SMB + * client then let it process by the SMB server. So mask + * the -EOPNOTSUPP error code. This will allow Linux SMB + * client to send SMB OPEN request to server. If server + * does not support this reparse point too then server + * will return error during open the path. + */ + if (rc == -EOPNOTSUPP) + rc = 0; } break; } diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index d86da949a919..007da0a699cb 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -257,7 +257,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_io_parms io_parms = {0}; int buf_type = CIFS_NO_BUFFER; - FILE_ALL_INFO file_info; + struct cifs_open_info_data query_data; oparms = (struct cifs_open_parms) { .tcon = tcon, @@ -269,11 +269,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, .fid = &fid, }; - rc = CIFS_open(xid, &oparms, &oplock, &file_info); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &query_data); if (rc) return rc; - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + if (query_data.fi.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { rc = -ENOENT; /* it's not a symlink */ goto out; @@ -312,7 +312,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, .fid = &fid, }; - rc = CIFS_open(xid, &oparms, &oplock, NULL); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL); if (rc) return rc; diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 75929a0a56f9..e616be8196de 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -733,7 +733,10 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, else cifs_buf_release(cfile->srch_inf. ntwrk_buf_start); + /* Reset all pointers to the network buffer to prevent stale references */ cfile->srch_inf.ntwrk_buf_start = NULL; + cfile->srch_inf.srch_entries_start = NULL; + cfile->srch_inf.last_entry = NULL; } rc = initiate_cifs_search(xid, file, full_path); if (rc) { @@ -756,11 +759,11 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, rc = server->ops->query_dir_next(xid, tcon, &cfile->fid, search_flags, &cfile->srch_inf); + if (rc) + return -ENOENT; /* FindFirst/Next set last_entry to NULL on malformed reply */ if (cfile->srch_inf.last_entry) cifs_save_resume_key(cfile->srch_inf.last_entry, cfile); - if (rc) - return -ENOENT; } if (index_to_find < cfile->srch_inf.index_of_last_entry) { /* we found the buffer that contains the entry */ diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index bb246ef0458f..b6556fe3dfa1 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -633,8 +633,6 @@ int parse_reparse_point(struct reparse_data_buffer *buf, const char *full_path, bool unicode, struct cifs_open_info_data *data) { - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - data->reparse.buf = buf; /* See MS-FSCC 2.1.2 */ @@ -658,8 +656,6 @@ int parse_reparse_point(struct reparse_data_buffer *buf, } return 0; default: - cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n", - le32_to_cpu(buf->ReparseTag)); return -EOPNOTSUPP; } } diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index c2a98b273664..f04922eb45d4 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -732,6 +732,22 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp) *pbcc_area = bcc_ptr; } +static void +ascii_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + + strcpy(bcc_ptr, "Linux version "); + bcc_ptr += strlen("Linux version "); + strcpy(bcc_ptr, init_utsname()->release); + bcc_ptr += strlen(init_utsname()->release) + 1; + + strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); + bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; + + *pbcc_area = bcc_ptr; +} + static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, const struct nls_table *nls_cp) { @@ -756,6 +772,25 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, *pbcc_area = bcc_ptr; } +static void ascii_domain_string(char **pbcc_area, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + int len; + + /* copy domain */ + if (ses->domainName != NULL) { + len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); + if (WARN_ON_ONCE(len < 0)) + len = CIFS_MAX_DOMAINNAME_LEN - 1; + bcc_ptr += len; + } /* else we send a null domain name so server will default to its own domain */ + *bcc_ptr = 0; + bcc_ptr++; + + *pbcc_area = bcc_ptr; +} + static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, const struct nls_table *nls_cp) { @@ -801,25 +836,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, *bcc_ptr = 0; bcc_ptr++; /* account for null termination */ - /* copy domain */ - if (ses->domainName != NULL) { - len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); - if (WARN_ON_ONCE(len < 0)) - len = CIFS_MAX_DOMAINNAME_LEN - 1; - bcc_ptr += len; - } /* else we send a null domain name so server will default to its own domain */ - *bcc_ptr = 0; - bcc_ptr++; - /* BB check for overflow here */ - strcpy(bcc_ptr, "Linux version "); - bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, init_utsname()->release); - bcc_ptr += strlen(init_utsname()->release) + 1; - - strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); - bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; + ascii_domain_string(&bcc_ptr, ses, nls_cp); + ascii_oslm_strings(&bcc_ptr, nls_cp); *pbcc_area = bcc_ptr; } @@ -1622,7 +1642,7 @@ sess_auth_kerberos(struct sess_data *sess_data) sess_data->iov[1].iov_len = msg->secblob_len; pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len); - if (ses->capabilities & CAP_UNICODE) { + if (pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) { /* unicode strings must be word aligned */ if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { *bcc_ptr = 0; @@ -1631,8 +1651,8 @@ sess_auth_kerberos(struct sess_data *sess_data) unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp); } else { - /* BB: is this right? */ - ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + ascii_oslm_strings(&bcc_ptr, sess_data->nls_cp); + ascii_domain_string(&bcc_ptr, ses, sess_data->nls_cp); } sess_data->iov[2].iov_len = (long) bcc_ptr - diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index bc1bac36c1b2..e62d9cc592e0 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -426,13 +426,6 @@ cifs_negotiate(const unsigned int xid, { int rc; rc = CIFSSMBNegotiate(xid, ses, server); - if (rc == -EAGAIN) { - /* retry only once on 1st time connection */ - set_credits(server, 1); - rc = CIFSSMBNegotiate(xid, ses, server); - if (rc == -EAGAIN) - rc = -EHOSTDOWN; - } return rc; } @@ -444,8 +437,8 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int wsize; /* start with specified wsize, or default */ - if (ctx->wsize) - wsize = ctx->wsize; + if (ctx->got_wsize) + wsize = ctx->vol_wsize; else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) wsize = CIFS_DEFAULT_IOSIZE; else @@ -497,7 +490,7 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) else defsize = server->maxBuf - sizeof(READ_RSP); - rsize = ctx->rsize ? ctx->rsize : defsize; + rsize = ctx->got_rsize ? ctx->vol_rsize : defsize; /* * no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to @@ -548,24 +541,104 @@ static int cifs_query_path_info(const unsigned int xid, const char *full_path, struct cifs_open_info_data *data) { - int rc; + int rc = -EOPNOTSUPP; FILE_ALL_INFO fi = {}; + struct cifs_search_info search_info = {}; + bool non_unicode_wildcard = false; data->symlink = false; data->adjust_tz = false; - /* could do find first instead but this returns more info */ - rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls, - cifs_remap(cifs_sb)); /* - * BB optimize code so we do not make the above call when server claims - * no NT SMB support and the above call failed at least once - set flag - * in tcon or mount. + * First try CIFSSMBQPathInfo() function which returns more info + * (NumberOfLinks) than CIFSFindFirst() fallback function. + * Some servers like Win9x do not support SMB_QUERY_FILE_ALL_INFO over + * TRANS2_QUERY_PATH_INFORMATION, but supports it with filehandle over + * TRANS2_QUERY_FILE_INFORMATION (function CIFSSMBQFileInfo(). But SMB + * Open command on non-NT servers works only for files, does not work + * for directories. And moreover Win9x SMB server returns bogus data in + * SMB_QUERY_FILE_ALL_INFO Attributes field. So for non-NT servers, + * do not even use CIFSSMBQPathInfo() or CIFSSMBQFileInfo() function. */ - if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { + if (tcon->ses->capabilities & CAP_NT_SMBS) + rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + + /* + * Non-UNICODE variant of fallback functions below expands wildcards, + * so they cannot be used for querying paths with wildcard characters. + */ + if (rc && !(tcon->ses->capabilities & CAP_UNICODE) && strpbrk(full_path, "*?\"><")) + non_unicode_wildcard = true; + + /* + * Then fallback to CIFSFindFirst() which works also with non-NT servers + * but does not does not provide NumberOfLinks. + */ + if ((rc == -EOPNOTSUPP || rc == -EINVAL) && + !non_unicode_wildcard) { + if (!(tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find)) + search_info.info_level = SMB_FIND_FILE_INFO_STANDARD; + else + search_info.info_level = SMB_FIND_FILE_FULL_DIRECTORY_INFO; + rc = CIFSFindFirst(xid, tcon, full_path, cifs_sb, NULL, + CIFS_SEARCH_CLOSE_ALWAYS | CIFS_SEARCH_CLOSE_AT_END, + &search_info, false); + if (rc == 0) { + if (!(tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find)) { + FIND_FILE_STANDARD_INFO *di; + int offset = tcon->ses->server->timeAdj; + + di = (FIND_FILE_STANDARD_INFO *)search_info.srch_entries_start; + fi.CreationTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm( + di->CreationDate, di->CreationTime, offset))); + fi.LastAccessTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm( + di->LastAccessDate, di->LastAccessTime, offset))); + fi.LastWriteTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm( + di->LastWriteDate, di->LastWriteTime, offset))); + fi.ChangeTime = fi.LastWriteTime; + fi.Attributes = cpu_to_le32(le16_to_cpu(di->Attributes)); + fi.AllocationSize = cpu_to_le64(le32_to_cpu(di->AllocationSize)); + fi.EndOfFile = cpu_to_le64(le32_to_cpu(di->DataSize)); + } else { + FILE_FULL_DIRECTORY_INFO *di; + + di = (FILE_FULL_DIRECTORY_INFO *)search_info.srch_entries_start; + fi.CreationTime = di->CreationTime; + fi.LastAccessTime = di->LastAccessTime; + fi.LastWriteTime = di->LastWriteTime; + fi.ChangeTime = di->ChangeTime; + fi.Attributes = di->ExtFileAttributes; + fi.AllocationSize = di->AllocationSize; + fi.EndOfFile = di->EndOfFile; + fi.EASize = di->EaSize; + } + fi.NumberOfLinks = cpu_to_le32(1); + fi.DeletePending = 0; + fi.Directory = !!(le32_to_cpu(fi.Attributes) & ATTR_DIRECTORY); + cifs_buf_release(search_info.ntwrk_buf_start); + } else if (!full_path[0]) { + /* + * CIFSFindFirst() does not work on root path if the + * root path was exported on the server from the top + * level path (drive letter). + */ + rc = -EOPNOTSUPP; + } + } + + /* + * If everything failed then fallback to the legacy SMB command + * SMB_COM_QUERY_INFORMATION which works with all servers, but + * provide just few information. + */ + if ((rc == -EOPNOTSUPP || rc == -EINVAL) && !non_unicode_wildcard) { rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls, cifs_remap(cifs_sb)); data->adjust_tz = true; + } else if ((rc == -EOPNOTSUPP || rc == -EINVAL) && non_unicode_wildcard) { + /* Path with non-UNICODE wildcard character cannot exist. */ + rc = -ENOENT; } if (!rc) { @@ -597,6 +670,42 @@ static int cifs_query_path_info(const unsigned int xid, CIFSSMBClose(xid, tcon, fid.netfid); } +#ifdef CONFIG_CIFS_XATTR + /* + * For WSL CHR and BLK reparse points it is required to fetch + * EA $LXDEV which contains major and minor device numbers. + */ + if (!rc && data->reparse_point) { + struct smb2_file_full_ea_info *ea; + + ea = (struct smb2_file_full_ea_info *)data->wsl.eas; + rc = CIFSSMBQAllEAs(xid, tcon, full_path, SMB2_WSL_XATTR_DEV, + &ea->ea_data[SMB2_WSL_XATTR_NAME_LEN + 1], + SMB2_WSL_XATTR_DEV_SIZE, cifs_sb); + if (rc == SMB2_WSL_XATTR_DEV_SIZE) { + ea->next_entry_offset = cpu_to_le32(0); + ea->flags = 0; + ea->ea_name_length = SMB2_WSL_XATTR_NAME_LEN; + ea->ea_value_length = cpu_to_le16(SMB2_WSL_XATTR_DEV_SIZE); + memcpy(&ea->ea_data[0], SMB2_WSL_XATTR_DEV, SMB2_WSL_XATTR_NAME_LEN + 1); + data->wsl.eas_len = sizeof(*ea) + SMB2_WSL_XATTR_NAME_LEN + 1 + + SMB2_WSL_XATTR_DEV_SIZE; + rc = 0; + } else if (rc >= 0) { + /* It is an error if EA $LXDEV has wrong size. */ + rc = -EINVAL; + } else { + /* + * In all other cases ignore error if fetching + * of EA $LXDEV failed. It is needed only for + * WSL CHR and BLK reparse points and wsl_to_fattr() + * handle the case when EA is missing. + */ + rc = 0; + } + } +#endif + return rc; } @@ -626,6 +735,13 @@ static int cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, int rc; FILE_ALL_INFO fi = {}; + /* + * CIFSSMBQFileInfo() for non-NT servers returns bogus data in + * Attributes fields. So do not use this command for non-NT servers. + */ + if (!(tcon->ses->capabilities & CAP_NT_SMBS)) + return -EOPNOTSUPP; + if (cfile->symlink_target) { data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); if (!data->symlink_target) @@ -796,6 +912,9 @@ smb_set_file_info(struct inode *inode, const char *full_path, struct cifs_fid fid; struct cifs_open_parms oparms; struct cifsFileInfo *open_file; + FILE_BASIC_INFO new_buf; + struct cifs_open_info_data query_data; + __le64 write_time = buf->LastWriteTime; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = NULL; @@ -803,20 +922,58 @@ smb_set_file_info(struct inode *inode, const char *full_path, /* if the file is already open for write, just use that fileid */ open_file = find_writable_file(cinode, FIND_WR_FSUID_ONLY); + if (open_file) { fid.netfid = open_file->fid.netfid; netpid = open_file->pid; tcon = tlink_tcon(open_file->tlink); - goto set_via_filehandle; + } else { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + tlink = NULL; + goto out; + } + tcon = tlink_tcon(tlink); } - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) { - rc = PTR_ERR(tlink); - tlink = NULL; - goto out; + /* + * Non-NT servers interprets zero time value in SMB_SET_FILE_BASIC_INFO + * over TRANS2_SET_FILE_INFORMATION as a valid time value. NT servers + * interprets zero time value as do not change existing value on server. + * API of ->set_file_info() callback expects that zero time value has + * the NT meaning - do not change. Therefore if server is non-NT and + * some time values in "buf" are zero, then fetch missing time values. + */ + if (!(tcon->ses->capabilities & CAP_NT_SMBS) && + (!buf->CreationTime || !buf->LastAccessTime || + !buf->LastWriteTime || !buf->ChangeTime)) { + rc = cifs_query_path_info(xid, tcon, cifs_sb, full_path, &query_data); + if (rc) { + if (open_file) { + cifsFileInfo_put(open_file); + open_file = NULL; + } + goto out; + } + /* + * Original write_time from buf->LastWriteTime is preserved + * as SMBSetInformation() interprets zero as do not change. + */ + new_buf = *buf; + buf = &new_buf; + if (!buf->CreationTime) + buf->CreationTime = query_data.fi.CreationTime; + if (!buf->LastAccessTime) + buf->LastAccessTime = query_data.fi.LastAccessTime; + if (!buf->LastWriteTime) + buf->LastWriteTime = query_data.fi.LastWriteTime; + if (!buf->ChangeTime) + buf->ChangeTime = query_data.fi.ChangeTime; } - tcon = tlink_tcon(tlink); + + if (open_file) + goto set_via_filehandle; rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls, cifs_sb); @@ -837,8 +994,45 @@ smb_set_file_info(struct inode *inode, const char *full_path, .fid = &fid, }; - cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); - rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (S_ISDIR(inode->i_mode) && !(tcon->ses->capabilities & CAP_NT_SMBS)) { + /* Opening directory path is not possible on non-NT servers. */ + rc = -EOPNOTSUPP; + } else { + /* + * Use cifs_open_file() instead of CIFS_open() as the + * cifs_open_file() selects the correct function which + * works also on non-NT servers. + */ + rc = cifs_open_file(xid, &oparms, &oplock, NULL); + /* + * Opening path for writing on non-NT servers is not + * possible when the read-only attribute is already set. + * Non-NT server in this case returns -EACCES. For those + * servers the only possible way how to clear the read-only + * bit is via SMB_COM_SETATTR command. + */ + if (rc == -EACCES && + (cinode->cifsAttrs & ATTR_READONLY) && + le32_to_cpu(buf->Attributes) != 0 && /* 0 = do not change attrs */ + !(le32_to_cpu(buf->Attributes) & ATTR_READONLY) && + !(tcon->ses->capabilities & CAP_NT_SMBS)) + rc = -EOPNOTSUPP; + } + + /* Fallback to SMB_COM_SETATTR command when absolutelty needed. */ + if (rc == -EOPNOTSUPP) { + cifs_dbg(FYI, "calling SetInformation since SetPathInfo for attrs/times not supported by this server\n"); + rc = SMBSetInformation(xid, tcon, full_path, + buf->Attributes != 0 ? buf->Attributes : cpu_to_le32(cinode->cifsAttrs), + write_time, + cifs_sb->local_nls, cifs_sb); + if (rc == 0) + cinode->cifsAttrs = le32_to_cpu(buf->Attributes); + else + rc = -EACCES; + goto out; + } + if (rc != 0) { if (rc == -EIO) rc = -EINVAL; @@ -846,6 +1040,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, } netpid = current->tgid; + cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for attrs/times not supported by this server\n"); set_via_filehandle: rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid); @@ -856,6 +1051,21 @@ set_via_filehandle: CIFSSMBClose(xid, tcon, fid.netfid); else cifsFileInfo_put(open_file); + + /* + * Setting the read-only bit is not honered on non-NT servers when done + * via open-semantics. So for setting it, use SMB_COM_SETATTR command. + * This command works only after the file is closed, so use it only when + * operation was called without the filehandle. + */ + if (open_file == NULL && + !(tcon->ses->capabilities & CAP_NT_SMBS) && + le32_to_cpu(buf->Attributes) & ATTR_READONLY) { + SMBSetInformation(xid, tcon, full_path, + buf->Attributes, + 0 /* do not change write time */, + cifs_sb->local_nls, cifs_sb); + } out: if (tlink != NULL) cifs_put_tlink(tlink); diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index db9c807115c6..d7f2835e0b1c 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -107,16 +107,25 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 int err_buftype = CIFS_NO_BUFFER; struct cifs_fid *fid = oparms->fid; struct network_resiliency_req nr_ioctl_req; + bool retry_without_read_attributes = false; smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb); if (smb2_path == NULL) return -ENOMEM; - oparms->desired_access |= FILE_READ_ATTRIBUTES; + if (!(oparms->desired_access & FILE_READ_ATTRIBUTES)) { + oparms->desired_access |= FILE_READ_ATTRIBUTES; + retry_without_read_attributes = true; + } smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov, &err_buftype); + if (rc == -EACCES && retry_without_read_attributes) { + oparms->desired_access &= ~FILE_READ_ATTRIBUTES; + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov, + &err_buftype); + } if (rc && data) { struct smb2_hdr *hdr = err_iov.iov_base; diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index 677ef6f99a5b..fadc5fc274eb 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -816,11 +816,12 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid, WARN_ONCE(tcon->tc_count < 0, "tcon refcount is negative"); spin_unlock(&cifs_tcp_ses_lock); - if (tcon->ses) + if (tcon->ses) { server = tcon->ses->server; - - cifs_server_dbg(FYI, "tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n", - tcon->tid, persistent_fid, volatile_fid); + cifs_server_dbg(FYI, + "tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n", + tcon->tid, persistent_fid, volatile_fid); + } return 0; } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index b809a616728f..4e3eacbec96d 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -428,12 +428,20 @@ smb2_negotiate(const unsigned int xid, server->CurrentMid = 0; spin_unlock(&server->mid_lock); rc = SMB2_negotiate(xid, ses, server); - /* BB we probably don't need to retry with modern servers */ - if (rc == -EAGAIN) - rc = -EHOSTDOWN; return rc; } +static inline unsigned int +prevent_zero_iosize(unsigned int size, const char *type) +{ + if (size == 0) { + cifs_dbg(VFS, "SMB: Zero %ssize calculated, using minimum value %u\n", + type, CIFS_MIN_DEFAULT_IOSIZE); + return CIFS_MIN_DEFAULT_IOSIZE; + } + return size; +} + static unsigned int smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { @@ -441,12 +449,12 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int wsize; /* start with specified wsize, or default */ - wsize = ctx->wsize ? ctx->wsize : CIFS_DEFAULT_IOSIZE; + wsize = ctx->got_wsize ? ctx->vol_wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); - return wsize; + return prevent_zero_iosize(wsize, "w"); } static unsigned int @@ -456,7 +464,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int wsize; /* start with specified wsize, or default */ - wsize = ctx->wsize ? ctx->wsize : SMB3_DEFAULT_IOSIZE; + wsize = ctx->got_wsize ? ctx->vol_wsize : SMB3_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { @@ -478,7 +486,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); - return wsize; + return prevent_zero_iosize(wsize, "w"); } static unsigned int @@ -488,13 +496,13 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int rsize; /* start with specified rsize, or default */ - rsize = ctx->rsize ? ctx->rsize : CIFS_DEFAULT_IOSIZE; + rsize = ctx->got_rsize ? ctx->vol_rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); - return rsize; + return prevent_zero_iosize(rsize, "r"); } static unsigned int @@ -504,7 +512,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int rsize; /* start with specified rsize, or default */ - rsize = ctx->rsize ? ctx->rsize : SMB3_DEFAULT_IOSIZE; + rsize = ctx->got_rsize ? ctx->vol_rsize : SMB3_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { @@ -527,7 +535,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); - return rsize; + return prevent_zero_iosize(rsize, "r"); } /* diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 0af3535e08f3..3e88e8b3c16e 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -2932,6 +2932,7 @@ replay_again: req->CreateContextsOffset = cpu_to_le32( sizeof(struct smb2_create_req) + iov[1].iov_len); + le32_add_cpu(&req->CreateContextsLength, iov[n_iov-1].iov_len); pc_buf = iov[n_iov-1].iov_base; } @@ -2978,7 +2979,7 @@ replay_again: /* Eventually save off posix specific response info and timestaps */ err_free_rsp_buf: - free_rsp_buf(resp_buftype, rsp); + free_rsp_buf(resp_buftype, rsp_iov.iov_base); kfree(pc_buf); err_free_req: cifs_small_buf_release(req); diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index ddf1a3aafee5..2269963e5008 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -178,7 +178,7 @@ delete_mid(struct mid_q_entry *mid) * Our basic "send data to server" function. Should be called with srv_mutex * held. The caller is responsible for handling the results. */ -static int +int smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, size_t *sent) { diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h index c3ee42188d25..1af827ae757e 100644 --- a/fs/smb/common/smb2pdu.h +++ b/fs/smb/common/smb2pdu.h @@ -95,6 +95,9 @@ */ #define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) +/* According to MS-SMB2 specification The minimum recommended value is 65536.*/ +#define CIFS_MIN_DEFAULT_IOSIZE (65536) + /* * SMB2 Header Definition * diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index 58380a986af5..f4b20b80af06 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -546,7 +546,19 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, retval = -ENOMEM; goto out; } - sess->user = user; + + if (!sess->user) { + /* First successful authentication */ + sess->user = user; + } else { + if (!ksmbd_compare_user(sess->user, user)) { + ksmbd_debug(AUTH, "different user tried to reuse session\n"); + retval = -EPERM; + ksmbd_free_user(user); + goto out; + } + ksmbd_free_user(user); + } memcpy(sess->sess_key, resp->payload, resp->session_key_len); memcpy(out_blob, resp->payload + resp->session_key_len, @@ -1012,9 +1024,9 @@ static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id, ses_enc_key = enc ? sess->smb3encryptionkey : sess->smb3decryptionkey; - if (enc) - ksmbd_user_session_get(sess); memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); + if (!enc) + ksmbd_user_session_put(sess); return 0; } @@ -1213,7 +1225,7 @@ free_iv: free_sg: kfree(sg); free_req: - kfree(req); + aead_request_free(req); free_ctx: ksmbd_release_crypto_ctx(ctx); return rc; diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 9a134181df61..82dcc86a32c5 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -180,7 +180,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) down_write(&sessions_table_lock); down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { - if (atomic_read(&sess->refcnt) == 0 && + if (atomic_read(&sess->refcnt) <= 1 && (sess->state != SMB2_SESSION_VALID || time_after(jiffies, sess->last_active + SMB2_SESSION_TIMEOUT))) { @@ -229,7 +229,11 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) if (!ksmbd_chann_del(conn, sess) && xa_empty(&sess->ksmbd_chann_list)) { hash_del(&sess->hlist); - ksmbd_session_destroy(sess); + down_write(&conn->session_lock); + xa_erase(&conn->sessions, sess->id); + up_write(&conn->session_lock); + if (atomic_dec_and_test(&sess->refcnt)) + ksmbd_session_destroy(sess); } } } @@ -248,13 +252,30 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) if (xa_empty(&sess->ksmbd_chann_list)) { xa_erase(&conn->sessions, sess->id); hash_del(&sess->hlist); - ksmbd_session_destroy(sess); + if (atomic_dec_and_test(&sess->refcnt)) + ksmbd_session_destroy(sess); } } up_write(&conn->session_lock); up_write(&sessions_table_lock); } +bool is_ksmbd_session_in_connection(struct ksmbd_conn *conn, + unsigned long long id) +{ + struct ksmbd_session *sess; + + down_read(&conn->session_lock); + sess = xa_load(&conn->sessions, id); + if (sess) { + up_read(&conn->session_lock); + return true; + } + up_read(&conn->session_lock); + + return false; +} + struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, unsigned long long id) { @@ -308,8 +329,8 @@ void ksmbd_user_session_put(struct ksmbd_session *sess) if (atomic_read(&sess->refcnt) <= 0) WARN_ON(1); - else - atomic_dec(&sess->refcnt); + else if (atomic_dec_and_test(&sess->refcnt)) + ksmbd_session_destroy(sess); } struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn, @@ -414,7 +435,7 @@ static struct ksmbd_session *__session_create(int protocol) xa_init(&sess->rpc_handle_list); sess->sequence_number = 1; rwlock_init(&sess->tree_conns_lock); - atomic_set(&sess->refcnt, 1); + atomic_set(&sess->refcnt, 2); ret = __init_smb2_session(sess); if (ret) diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index c1c4b20bd5c6..f21348381d59 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -87,6 +87,8 @@ void ksmbd_session_destroy(struct ksmbd_session *sess); struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id); struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, unsigned long long id); +bool is_ksmbd_session_in_connection(struct ksmbd_conn *conn, + unsigned long long id); int ksmbd_session_register(struct ksmbd_conn *conn, struct ksmbd_session *sess); void ksmbd_sessions_deregister(struct ksmbd_conn *conn); diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 11e82a14a40a..e564432643ea 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -129,14 +129,6 @@ static void free_opinfo(struct oplock_info *opinfo) kfree(opinfo); } -static inline void opinfo_free_rcu(struct rcu_head *rcu_head) -{ - struct oplock_info *opinfo; - - opinfo = container_of(rcu_head, struct oplock_info, rcu_head); - free_opinfo(opinfo); -} - struct oplock_info *opinfo_get(struct ksmbd_file *fp) { struct oplock_info *opinfo; @@ -154,12 +146,9 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci) { struct oplock_info *opinfo; - if (list_empty(&ci->m_op_list)) - return NULL; - - rcu_read_lock(); - opinfo = list_first_or_null_rcu(&ci->m_op_list, struct oplock_info, - op_entry); + down_read(&ci->m_lock); + opinfo = list_first_entry_or_null(&ci->m_op_list, struct oplock_info, + op_entry); if (opinfo) { if (opinfo->conn == NULL || !atomic_inc_not_zero(&opinfo->refcount)) @@ -171,8 +160,7 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci) } } } - - rcu_read_unlock(); + up_read(&ci->m_lock); return opinfo; } @@ -185,7 +173,7 @@ void opinfo_put(struct oplock_info *opinfo) if (!atomic_dec_and_test(&opinfo->refcount)) return; - call_rcu(&opinfo->rcu_head, opinfo_free_rcu); + free_opinfo(opinfo); } static void opinfo_add(struct oplock_info *opinfo) @@ -193,7 +181,7 @@ static void opinfo_add(struct oplock_info *opinfo) struct ksmbd_inode *ci = opinfo->o_fp->f_ci; down_write(&ci->m_lock); - list_add_rcu(&opinfo->op_entry, &ci->m_op_list); + list_add(&opinfo->op_entry, &ci->m_op_list); up_write(&ci->m_lock); } @@ -207,7 +195,7 @@ static void opinfo_del(struct oplock_info *opinfo) write_unlock(&lease_list_lock); } down_write(&ci->m_lock); - list_del_rcu(&opinfo->op_entry); + list_del(&opinfo->op_entry); up_write(&ci->m_lock); } @@ -724,8 +712,8 @@ static int smb2_oplock_break_noti(struct oplock_info *opinfo) work->conn = conn; work->sess = opinfo->sess; + ksmbd_conn_r_count_inc(conn); if (opinfo->op_state == OPLOCK_ACK_WAIT) { - ksmbd_conn_r_count_inc(conn); INIT_WORK(&work->work, __smb2_oplock_break_noti); ksmbd_queue_work(work); @@ -833,8 +821,8 @@ static int smb2_lease_break_noti(struct oplock_info *opinfo) work->conn = conn; work->sess = opinfo->sess; + ksmbd_conn_r_count_inc(conn); if (opinfo->op_state == OPLOCK_ACK_WAIT) { - ksmbd_conn_r_count_inc(conn); INIT_WORK(&work->work, __smb2_lease_break_noti); ksmbd_queue_work(work); wait_for_break_ack(opinfo); @@ -1347,8 +1335,8 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, ci = fp->f_ci; op = opinfo_get(fp); - rcu_read_lock(); - list_for_each_entry_rcu(brk_op, &ci->m_op_list, op_entry) { + down_read(&ci->m_lock); + list_for_each_entry(brk_op, &ci->m_op_list, op_entry) { if (brk_op->conn == NULL) continue; @@ -1358,7 +1346,6 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, if (ksmbd_conn_releasing(brk_op->conn)) continue; - rcu_read_unlock(); if (brk_op->is_lease && (brk_op->o_lease->state & (~(SMB2_LEASE_READ_CACHING_LE | SMB2_LEASE_HANDLE_CACHING_LE)))) { @@ -1388,9 +1375,8 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, oplock_break(brk_op, SMB2_OPLOCK_LEVEL_NONE, NULL); next: opinfo_put(brk_op); - rcu_read_lock(); } - rcu_read_unlock(); + up_read(&ci->m_lock); if (op) opinfo_put(op); @@ -1505,6 +1491,10 @@ struct lease_ctx_info *parse_lease_state(void *open_req) if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) { struct create_lease_v2 *lc = (struct create_lease_v2 *)cc; + if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) < + sizeof(struct create_lease_v2) - 4) + goto err_out; + memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); lreq->req_state = lc->lcontext.LeaseState; lreq->flags = lc->lcontext.LeaseFlags; @@ -1517,6 +1507,10 @@ struct lease_ctx_info *parse_lease_state(void *open_req) } else { struct create_lease *lc = (struct create_lease *)cc; + if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) < + sizeof(struct create_lease)) + goto err_out; + memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); lreq->req_state = lc->lcontext.LeaseState; lreq->flags = lc->lcontext.LeaseFlags; @@ -1524,6 +1518,9 @@ struct lease_ctx_info *parse_lease_state(void *open_req) lreq->version = 1; } return lreq; +err_out: + kfree(lreq); + return NULL; } /** diff --git a/fs/smb/server/oplock.h b/fs/smb/server/oplock.h index 59554b73f60c..0fe931485465 100644 --- a/fs/smb/server/oplock.h +++ b/fs/smb/server/oplock.h @@ -78,7 +78,6 @@ struct oplock_info { struct list_head lease_entry; wait_queue_head_t oplock_q; /* Other server threads */ wait_queue_head_t oplock_brk; /* oplock breaking wait */ - struct rcu_head rcu_head; }; struct lease_break_info { diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 58e5cc2b1f3e..9bd817427a34 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -632,6 +632,11 @@ smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls) return name; } + if (*name == '\0') { + kfree(name); + return ERR_PTR(-EINVAL); + } + if (*name == '\\') { pr_err("not allow directory name included leading slash\n"); kfree(name); @@ -1599,9 +1604,6 @@ static int krb5_authenticate(struct ksmbd_work *work, if (prev_sess_id && prev_sess_id != sess->id) destroy_previous_session(conn, sess->user, prev_sess_id); - if (sess->state == SMB2_SESSION_VALID) - ksmbd_free_user(sess->user); - retval = ksmbd_krb5_authenticate(sess, in_blob, in_len, out_blob, &out_len); if (retval) { @@ -1707,44 +1709,38 @@ int smb2_sess_setup(struct ksmbd_work *work) if (conn->dialect != sess->dialect) { rc = -EINVAL; - ksmbd_user_session_put(sess); goto out_err; } if (!(req->hdr.Flags & SMB2_FLAGS_SIGNED)) { rc = -EINVAL; - ksmbd_user_session_put(sess); goto out_err; } if (strncmp(conn->ClientGUID, sess->ClientGUID, SMB2_CLIENT_GUID_SIZE)) { rc = -ENOENT; - ksmbd_user_session_put(sess); goto out_err; } if (sess->state == SMB2_SESSION_IN_PROGRESS) { rc = -EACCES; - ksmbd_user_session_put(sess); goto out_err; } if (sess->state == SMB2_SESSION_EXPIRED) { rc = -EFAULT; - ksmbd_user_session_put(sess); goto out_err; } - ksmbd_user_session_put(sess); if (ksmbd_conn_need_reconnect(conn)) { rc = -EFAULT; + ksmbd_user_session_put(sess); sess = NULL; goto out_err; } - sess = ksmbd_session_lookup(conn, sess_id); - if (!sess) { + if (is_ksmbd_session_in_connection(conn, sess_id)) { rc = -EACCES; goto out_err; } @@ -1910,6 +1906,8 @@ out_err: sess->last_active = jiffies; sess->state = SMB2_SESSION_EXPIRED; + ksmbd_user_session_put(sess); + work->sess = NULL; if (try_delay) { ksmbd_conn_set_need_reconnect(conn); ssleep(5); @@ -2235,13 +2233,14 @@ int smb2_session_logoff(struct ksmbd_work *work) return -ENOENT; } - ksmbd_destroy_file_table(&sess->file_table); down_write(&conn->session_lock); sess->state = SMB2_SESSION_EXPIRED; up_write(&conn->session_lock); - ksmbd_free_user(sess->user); - sess->user = NULL; + if (sess->user) { + ksmbd_free_user(sess->user); + sess->user = NULL; + } ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE); rsp->StructureSize = cpu_to_le16(4); @@ -2704,6 +2703,13 @@ static int parse_durable_handle_context(struct ksmbd_work *work, goto out; } + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_durable_reconn_v2_req)) { + err = -EINVAL; + goto out; + } + recon_v2 = (struct create_durable_reconn_v2_req *)context; persistent_id = recon_v2->Fid.PersistentFileId; dh_info->fp = ksmbd_lookup_durable_fd(persistent_id); @@ -2737,6 +2743,13 @@ static int parse_durable_handle_context(struct ksmbd_work *work, goto out; } + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_durable_reconn_req)) { + err = -EINVAL; + goto out; + } + recon = (struct create_durable_reconn_req *)context; persistent_id = recon->Data.Fid.PersistentFileId; dh_info->fp = ksmbd_lookup_durable_fd(persistent_id); @@ -2762,6 +2775,13 @@ static int parse_durable_handle_context(struct ksmbd_work *work, goto out; } + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_durable_req_v2)) { + err = -EINVAL; + goto out; + } + durable_v2_blob = (struct create_durable_req_v2 *)context; ksmbd_debug(SMB, "Request for durable v2 open\n"); diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index da8ed72f335d..b90f893762f4 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -270,6 +270,11 @@ static int sid_to_id(struct mnt_idmap *idmap, return -EIO; } + if (psid->num_subauth == 0) { + pr_err("%s: zero subauthorities!\n", __func__); + return -EIO; + } + if (sidtype == SIDOWNER) { kuid_t uid; uid_t id; @@ -398,7 +403,9 @@ static void parse_dacl(struct mnt_idmap *idmap, if (num_aces <= 0) return; - if (num_aces > ULONG_MAX / sizeof(struct smb_ace *)) + if (num_aces > (le16_to_cpu(pdacl->size) - sizeof(struct smb_acl)) / + (offsetof(struct smb_ace, sid) + + offsetof(struct smb_sid, sub_auth) + sizeof(__le16))) return; ret = init_acl_state(&acl_state, num_aces); @@ -432,6 +439,7 @@ static void parse_dacl(struct mnt_idmap *idmap, offsetof(struct smb_sid, sub_auth); if (end_of_acl - acl_base < acl_size || + ppace[i]->sid.num_subauth == 0 || ppace[i]->sid.num_subauth > SID_MAX_SUB_AUTHORITIES || (end_of_acl - acl_base < acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth) || diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 2d7cd7f42f27..281101fd1f76 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -296,7 +296,11 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) server_conf.signing = req->signing; server_conf.tcp_port = req->tcp_port; server_conf.ipc_timeout = req->ipc_timeout * HZ; - server_conf.deadtime = req->deadtime * SMB_ECHO_INTERVAL; + if (check_mul_overflow(req->deadtime, SMB_ECHO_INTERVAL, + &server_conf.deadtime)) { + ret = -EINVAL; + goto out; + } server_conf.share_fake_fscaps = req->share_fake_fscaps; ksmbd_init_domain(req->sub_auth); @@ -322,6 +326,7 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) ret |= ksmbd_set_work_group(req->work_group); ret |= ksmbd_tcp_set_interfaces(KSMBD_STARTUP_CONFIG_INTERFACES(req), req->ifc_list_sz); +out: if (ret) { pr_err("Server configuration error: %s %s %s\n", req->netbios_name, req->server_string, diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index d0c19ad9d014..3bbf23827060 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -426,10 +426,15 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, ksmbd_debug(VFS, "write stream data pos : %llu, count : %zd\n", *pos, count); + if (*pos >= XATTR_SIZE_MAX) { + pr_err("stream write position %lld is out of bounds\n", *pos); + return -EINVAL; + } + size = *pos + count; if (size > XATTR_SIZE_MAX) { size = XATTR_SIZE_MAX; - count = (*pos + count) - XATTR_SIZE_MAX; + count = XATTR_SIZE_MAX - *pos; } v_len = ksmbd_vfs_getcasexattr(idmap, @@ -496,7 +501,8 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, int err = 0; if (work->conn->connection_type) { - if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) { + if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE)) || + S_ISDIR(file_inode(fp->filp)->i_mode)) { pr_err("no right to write(%pD)\n", fp->filp); err = -EACCES; goto out; diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 271a23abc82f..002f0864abee 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -644,21 +644,40 @@ __close_file_table_ids(struct ksmbd_file_table *ft, bool (*skip)(struct ksmbd_tree_connect *tcon, struct ksmbd_file *fp)) { - unsigned int id; - struct ksmbd_file *fp; - int num = 0; + struct ksmbd_file *fp; + unsigned int id = 0; + int num = 0; + + while (1) { + write_lock(&ft->lock); + fp = idr_get_next(ft->idr, &id); + if (!fp) { + write_unlock(&ft->lock); + break; + } - idr_for_each_entry(ft->idr, fp, id) { - if (skip(tcon, fp)) + if (skip(tcon, fp) || + !atomic_dec_and_test(&fp->refcount)) { + id++; + write_unlock(&ft->lock); continue; + } set_close_state_blocked_works(fp); + idr_remove(ft->idr, fp->volatile_id); + fp->volatile_id = KSMBD_NO_FID; + write_unlock(&ft->lock); + + down_write(&fp->f_ci->m_lock); + list_del_init(&fp->node); + up_write(&fp->f_ci->m_lock); - if (!atomic_dec_and_test(&fp->refcount)) - continue; __ksmbd_close_fd(ft, fp); + num++; + id++; } + return num; } diff --git a/fs/splice.c b/fs/splice.c index d983d375ff11..6f9b06bbb860 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -45,7 +45,7 @@ * here if set to avoid blocking other users of this pipe if splice is * being done on it. */ -static noinline void noinline pipe_clear_nowait(struct file *file) +static noinline void pipe_clear_nowait(struct file *file) { fmode_t fmode = READ_ONCE(file->f_mode); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 22e812808e5c..3a27d4268b3c 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -202,6 +202,11 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) msblk->panic_on_errors = (opts->errors == Opt_errors_panic); msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); + if (!msblk->devblksize) { + errorf(fc, "squashfs: unable to set blocksize\n"); + return -EINVAL; + } + msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->meta_index_mutex); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e98c198f85b9..2f73119c7ec9 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -814,6 +814,7 @@ static int inode_getblk(struct inode *inode, struct udf_map_rq *map) } map->oflags = UDF_BLK_MAPPED; map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset); + ret = 0; goto out_free; } diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 4f33a4a48886..b4071c9cf8c9 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -115,7 +115,7 @@ void udf_truncate_tail_extent(struct inode *inode) } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ - if (ret == 0) + if (ret >= 0) iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5d3e595f9da9..5ceb1fa8eb11 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -452,32 +452,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) goto out; /* - * If it's already released don't get it. This avoids to loop - * in __get_user_pages if userfaultfd_release waits on the - * caller of handle_userfault to release the mmap_lock. - */ - if (unlikely(READ_ONCE(ctx->released))) { - /* - * Don't return VM_FAULT_SIGBUS in this case, so a non - * cooperative manager can close the uffd after the - * last UFFDIO_COPY, without risking to trigger an - * involuntary SIGBUS if the process was starting the - * userfaultfd while the userfaultfd was still armed - * (but after the last UFFDIO_COPY). If the uffd - * wasn't already closed when the userfault reached - * this point, that would normally be solved by - * userfaultfd_must_wait returning 'false'. - * - * If we were to return VM_FAULT_SIGBUS here, the non - * cooperative manager would be instead forced to - * always call UFFDIO_UNREGISTER before it can safely - * close the uffd. - */ - ret = VM_FAULT_NOPAGE; - goto out; - } - - /* * Check that we can return VM_FAULT_RETRY. * * NOTE: it should become possible to return VM_FAULT_RETRY @@ -513,6 +487,31 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; + if (unlikely(READ_ONCE(ctx->released))) { + /* + * If a concurrent release is detected, do not return + * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always + * return VM_FAULT_RETRY with lock released proactively. + * + * If we were to return VM_FAULT_SIGBUS here, the non + * cooperative manager would be instead forced to + * always call UFFDIO_UNREGISTER before it can safely + * close the uffd, to avoid involuntary SIGBUS triggered. + * + * If we were to return VM_FAULT_NOPAGE, it would work for + * the fault path, in which the lock will be released + * later. However for GUP, faultin_page() does nothing + * special on NOPAGE, so GUP would spin retrying without + * releasing the mmap read lock, causing possible livelock. + * + * Here only VM_FAULT_RETRY would make sure the mmap lock + * be released immediately, so that the thread concurrently + * releasing the userfault would always make progress. + */ + release_fault_lock(vmf); + goto out; + } + /* take the reference before dropping the mmap_lock */ userfaultfd_ctx_get(ctx); diff --git a/fs/xattr.c b/fs/xattr.c index c20046548f21..5fed22c22a2b 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1291,6 +1291,15 @@ static bool xattr_is_trusted(const char *name) return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } +static bool xattr_is_maclabel(const char *name) +{ + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + + return !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) && + security_ismaclabel(suffix); +} + /** * simple_xattr_list - list all xattr objects * @inode: inode from which to get the xattrs @@ -1323,6 +1332,17 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, if (err) return err; + err = security_inode_listsecurity(inode, buffer, remaining_size); + if (err < 0) + return err; + + if (buffer) { + if (remaining_size < err) + return -ERANGE; + buffer += err; + } + remaining_size -= err; + read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); @@ -1331,6 +1351,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, if (!trusted && xattr_is_trusted(xattr->name)) continue; + /* skip MAC labels; these are provided by LSM above */ + if (xattr_is_maclabel(xattr->name)) + continue; + err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) break; |