Diffstat (limited to 'fs')
66 files changed, 1549 insertions, 1114 deletions
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index adbb3a1edcbf..5156821bfe6a 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -355,7 +355,6 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct super_block *sb = inode->i_sb; struct object_info obj; - int ret; obj.indaddr = ADFS_I(inode)->indaddr; obj.name_len = 0; @@ -365,6 +364,5 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) obj.attr = ADFS_I(inode)->attr; obj.size = inode->i_size; - ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); - return ret; + return adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f8c7f26f1fbb..605017eb9349 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1116,11 +1116,11 @@ out_free_interp: * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). */ - if (interpreter) { + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); + if (alignment > ELF_MIN_ALIGN) { load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); if (alignment) load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED_NOREPLACE; @@ -1585,7 +1585,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); - strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + get_task_comm(psinfo->pr_fname, p); return 0; } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 520a0f6a7d9e..183e5c4aed34 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -18,8 +18,7 @@ config BTRFS_FS select RAID6_PQ select XOR_BLOCKS select SRCU - depends on !PPC_256K_PAGES # powerpc - depends on !PAGE_SIZE_256KB # hexagon + depends on PAGE_SIZE_LESS_THAN_256KB help Btrfs is a general purpose copy-on-write filesystem with extents, diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7d2c33cdbac6..7d305b974824 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3376,8 +3376,7 @@ static void handle_cap_grant(struct inode *inode, if ((newcaps & CEPH_CAP_LINK_SHARED) && (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); - if (inode->i_nlink == 0 && - (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + if (inode->i_nlink == 0) deleted_inode = true; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9d9304e712d9..5b9104b8e453 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -204,6 +204,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, int fmode, bool isdir) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mount_options *opt = + ceph_inode_to_client(&ci->vfs_inode)->mount_options; struct ceph_file_info *fi; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, @@ -225,6 +227,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, if (!fi) return -ENOMEM; + if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + fi->flags |= CEPH_F_SYNC; + file->private_data = fi; } @@ -1541,7 +1546,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) struct ceph_inode_info *ci = ceph_inode(inode); bool direct_lock = iocb->ki_flags & IOCB_DIRECT; ssize_t ret; - int want, got = 0; + int want = 0, got = 0; int retry_op = 0, read = 0; again: @@ -1556,13 +1561,14 @@ again: else ceph_start_io_read(inode); + 
if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_CACHE; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_CACHE; + want |= CEPH_CAP_FILE_LAZYIO; + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got); if (ret < 0) { - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_end_io_direct(inode); else ceph_end_io_read(inode); @@ -1696,7 +1702,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; - int err, want, got; + int err, want = 0, got; bool direct_lock = false; u32 map_flags; u64 pool_flags; @@ -1771,10 +1777,10 @@ retry_snap: dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", inode, ceph_vinop(inode), pos, count, i_size_read(inode)); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_BUFFER; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; + want |= CEPH_CAP_FILE_LAZYIO; got = 0; err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got); if (err < 0) diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index c57699d8408d..0fcba68f9a99 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -160,8 +160,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, msg->hdr.version = cpu_to_le16(1); msg->hdr.compat_version = cpu_to_le16(1); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - dout("client%llu send metrics to mds%d\n", - ceph_client_gid(mdsc->fsc->client), s->s_mds); ceph_con_send(&s->s_con, msg); return true; diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 620c691af40e..a338a3ec0dc4 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -30,6 +30,9 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode) /* if root is the real CephFS root, we don't have quota realms */ if (root && ceph_ino(root) == CEPH_INO_ROOT) return false; + /* MDS stray dirs have no quota realms */ + if (ceph_vino_is_reserved(ceph_inode(inode)->i_vino)) + return false; /* otherwise, we can't know for sure */ return true; } @@ -494,10 +497,24 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) if (ci->i_max_bytes) { total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT; used = ci->i_rbytes >> CEPH_BLOCK_SHIFT; + /* For quota size less than 4MB, use 4KB block size */ + if (!total) { + total = ci->i_max_bytes >> CEPH_4K_BLOCK_SHIFT; + used = ci->i_rbytes >> CEPH_4K_BLOCK_SHIFT; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } /* It is possible for a quota to be exceeded. * Report 'zero' in that case */ free = total > used ? total - used : 0; + /* For quota size less than 4KB, report the + * total=used=4KB,free=0 when quota is full + * and total=free=4KB, used=0 otherwise */ + if (!total) { + total = 1; + free = ci->i_max_bytes > ci->i_rbytes ? 
1 : 0; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } } spin_unlock(&ci->i_ceph_lock); if (total) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bea89bdb534a..bf79f369aec6 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -27,6 +27,8 @@ #include <linux/ceph/auth.h> #include <linux/ceph/debugfs.h> +#include <uapi/linux/magic.h> + static DEFINE_SPINLOCK(ceph_fsc_lock); static LIST_HEAD(ceph_fsc_list); @@ -146,6 +148,7 @@ enum { Opt_mds_namespace, Opt_recover_session, Opt_source, + Opt_mon_addr, /* string args above */ Opt_dirstat, Opt_rbytes, @@ -159,6 +162,7 @@ enum { Opt_quotadf, Opt_copyfrom, Opt_wsync, + Opt_pagecache, }; enum ceph_recover_session_mode { @@ -197,8 +201,10 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_u32 ("rsize", Opt_rsize), fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), + fsparam_string ("mon_addr", Opt_mon_addr), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), + fsparam_flag_no ("pagecache", Opt_pagecache), {} }; @@ -228,9 +234,92 @@ static void canonicalize_path(char *path) } /* - * Parse the source parameter. Distinguish the server list from the path. + * Check if the mds namespace in ceph_mount_options matches + * the passed in namespace string. First time match (when + * ->mds_namespace is NULL) is treated specially, since + * ->mds_namespace needs to be initialized by the caller. + */ +static int namespace_equals(struct ceph_mount_options *fsopt, + const char *namespace, size_t len) +{ + return !(fsopt->mds_namespace && + (strlen(fsopt->mds_namespace) != len || + strncmp(fsopt->mds_namespace, namespace, len))); +} + +static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + int r; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + if (*dev_name_end != ':') + return invalfc(fc, "separator ':' missing in source"); + + r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name, + pctx->copts, fc->log.log, ','); + if (r) + return r; + + fsopt->new_dev_syntax = false; + return 0; +} + +static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + size_t len; + struct ceph_fsid fsid; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + char *fsid_start, *fs_name_start; + + if (*dev_name_end != '=') { + dout("separator '=' missing in source"); + return -EINVAL; + } + + fsid_start = strchr(dev_name, '@'); + if (!fsid_start) + return invalfc(fc, "missing cluster fsid"); + ++fsid_start; /* start of cluster fsid */ + + fs_name_start = strchr(fsid_start, '.'); + if (!fs_name_start) + return invalfc(fc, "missing file system name"); + + if (ceph_parse_fsid(fsid_start, &fsid)) + return invalfc(fc, "Invalid FSID"); + + ++fs_name_start; /* start of file system name */ + len = dev_name_end - fs_name_start; + + if (!namespace_equals(fsopt, fs_name_start, len)) + return invalfc(fc, "Mismatching mds_namespace"); + kfree(fsopt->mds_namespace); + fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL); + if (!fsopt->mds_namespace) + return -ENOMEM; + dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace); + + fsopt->new_dev_syntax = true; + return 0; +} + +/* + * Parse the source parameter for new device format. Distinguish the device + * spec from the path. Try parsing new device format and fallback to old + * format if needed. 
+ * + * New device syntax will look like: + * <device_spec>=/<path> + * where + * <device_spec> is name@fsid.fsname + * <path> is optional, but if present must begin with '/' + * (monitor addresses are passed via mount option) * - * The source will look like: * <server_spec>[,<server_spec>...]:[<path>] * where * <server_spec> is <ip>[:<port>] @@ -263,24 +352,44 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) dev_name_end = dev_name + strlen(dev_name); } - dev_name_end--; /* back up to ':' separator */ - if (dev_name_end < dev_name || *dev_name_end != ':') - return invalfc(fc, "No path or : separator in source"); + dev_name_end--; /* back up to separator */ + if (dev_name_end < dev_name) + return invalfc(fc, "Path missing in source"); dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); if (fsopt->server_path) dout("server path '%s'\n", fsopt->server_path); - ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, - pctx->copts, fc->log.log); - if (ret) - return ret; + dout("trying new device syntax"); + ret = ceph_parse_new_source(dev_name, dev_name_end, fc); + if (ret) { + if (ret != -EINVAL) + return ret; + dout("trying old device syntax"); + ret = ceph_parse_old_source(dev_name, dev_name_end, fc); + if (ret) + return ret; + } fc->source = param->string; param->string = NULL; return 0; } +static int ceph_parse_mon_addr(struct fs_parameter *param, + struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + kfree(fsopt->mon_addr); + fsopt->mon_addr = param->string; + param->string = NULL; + + return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr), + pctx->copts, fc->log.log, '/'); +} + static int ceph_parse_mount_param(struct fs_context *fc, struct fs_parameter *param) { @@ -306,6 +415,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, param->string = NULL; break; case Opt_mds_namespace: + if (!namespace_equals(fsopt, param->string, strlen(param->string))) + return invalfc(fc, "Mismatching mds_namespace"); kfree(fsopt->mds_namespace); fsopt->mds_namespace = param->string; param->string = NULL; @@ -323,6 +434,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, if (fc->source) return invalfc(fc, "Multiple sources specified"); return ceph_parse_source(param, fc); + case Opt_mon_addr: + return ceph_parse_mon_addr(param, fc); case Opt_wsize: if (result.uint_32 < PAGE_SIZE || result.uint_32 > CEPH_MAX_WRITE_SIZE) @@ -455,6 +568,12 @@ static int ceph_parse_mount_param(struct fs_context *fc, else fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS; break; + case Opt_pagecache: + if (result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE; + break; default: BUG(); } @@ -474,6 +593,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) kfree(args->mds_namespace); kfree(args->server_path); kfree(args->fscache_uniq); + kfree(args->mon_addr); kfree(args); } @@ -517,6 +637,10 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; + ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr); + if (ret) + return ret; + return ceph_compare_options(new_opt, fsc->client); } @@ -572,15 +696,22 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0) seq_puts(m, ",copyfrom"); - if (fsopt->mds_namespace) + /* dump mds_namespace when old device
syntax is in use */ + if (fsopt->mds_namespace && !fsopt->new_dev_syntax) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + if (fsopt->mon_addr) + seq_printf(m, ",mon_addr=%s", fsopt->mon_addr); + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) seq_show_option(m, "recover_session", "clean"); if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) seq_puts(m, ",wsync"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + seq_puts(m, ",nopagecache"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) @@ -1052,6 +1183,7 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) static int ceph_get_tree(struct fs_context *fc) { struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; struct super_block *sb; struct ceph_fs_client *fsc; struct dentry *res; @@ -1063,6 +1195,8 @@ static int ceph_get_tree(struct fs_context *fc) if (!fc->source) return invalfc(fc, "No source"); + if (fsopt->new_dev_syntax && !fsopt->mon_addr) + return invalfc(fc, "No monitor address"); /* create client (which we may/may not use) */ fsc = create_fs_client(pctx->opts, pctx->copts); @@ -1148,6 +1282,13 @@ static int ceph_reconfigure_fc(struct fs_context *fc) else ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) { + kfree(fsc->mount_options->mon_addr); + fsc->mount_options->mon_addr = fsopt->mon_addr; + fsopt->mon_addr = NULL; + pr_notice("ceph: monitor addresses recorded, but not used for reconnection"); + } + sync_filesystem(fc->root->d_sb); return 0; } @@ -1325,6 +1466,14 @@ bool disable_send_metrics = false; module_param_cb(disable_send_metrics, &param_ops_metrics, &disable_send_metrics, 0644); MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)"); +/* for both v1 and v2 syntax */ +static bool mount_support = true; +static const struct kernel_param_ops param_ops_mount_syntax = { + .get = param_get_bool, +}; +module_param_cb(mount_syntax_v1, &param_ops_mount_syntax, &mount_support, 0444); +module_param_cb(mount_syntax_v2, &param_ops_mount_syntax, &mount_support, 0444); + module_init(init_ceph); module_exit(exit_ceph); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index d0142cc5c41b..67f145e1ae7a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -24,13 +24,11 @@ #include <linux/fscache.h> #endif -/* f_type in struct statfs */ -#define CEPH_SUPER_MAGIC 0x00c36400 - /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines.
*/ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) +#define CEPH_4K_BLOCK_SHIFT 12 /* 4 KB */ #define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reconnect (clean mode) after blocklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ @@ -44,6 +42,7 @@ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ #define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ +#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ @@ -88,6 +87,8 @@ struct ceph_mount_options { unsigned int max_readdir; /* max readdir result (entries) */ unsigned int max_readdir_bytes; /* max readdir result (bytes) */ + bool new_dev_syntax; + /* * everything above this point can be memcmp'd; everything below * is handled in compare_mount_options() @@ -97,6 +98,7 @@ struct ceph_mount_options { char *mds_namespace; /* default NULL */ char *server_path; /* default NULL (means "/") */ char *fscache_uniq; /* default NULL */ + char *mon_addr; }; struct ceph_fs_client { @@ -534,19 +536,23 @@ static inline int ceph_ino_compare(struct inode *inode, void *data) * * These come from src/mds/mdstypes.h in the ceph sources. */ -#define CEPH_MAX_MDS 0x100 -#define CEPH_NUM_STRAY 10 +#define CEPH_MAX_MDS 0x100 +#define CEPH_NUM_STRAY 10 #define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) +#define CEPH_MDS_INO_LOG_OFFSET (2 * CEPH_MAX_MDS) #define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) { - if (vino.ino < CEPH_INO_SYSTEM_BASE && - vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) { - WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino); - return true; - } - return false; + if (vino.ino >= CEPH_INO_SYSTEM_BASE || + vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET) + return false; + + /* Don't warn on mdsdirs */ + WARN_RATELIMIT(vino.ino >= CEPH_MDS_INO_LOG_OFFSET, + "Attempt to access reserved inode number 0x%llx", + vino.ino); + return true; } static inline struct inode *ceph_find_inode(struct super_block *sb, diff --git a/fs/exec.c b/fs/exec.c index 82db656ca709..3c3c366a9bcf 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1207,7 +1207,8 @@ static int unshare_sighand(struct task_struct *me) char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { task_lock(tsk); - strncpy(buf, tsk->comm, buf_size); + /* Always NUL terminated and zero-padded */ + strscpy_pad(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } @@ -1222,7 +1223,7 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); - strlcpy(tsk->comm, buf, sizeof(tsk->comm)); + strscpy_pad(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk, exec); } diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 7eea3cfd894d..f46a7339d6cf 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -7,6 +7,7 @@ config F2FS_FS select CRYPTO_CRC32 select F2FS_FS_XATTR if FS_ENCRYPTION select FS_ENCRYPTION_ALGS if FS_ENCRYPTION + select FS_IOMAP select LZ4_COMPRESS if F2FS_FS_LZ4 select LZ4_DECOMPRESS if F2FS_FS_LZ4 select LZ4HC_COMPRESS if F2FS_FS_LZ4HC diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f1693d45bb78..982f0170639f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -664,7 +664,7 @@ static int
recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); - err = f2fs_get_node_info(sbi, ino, &ni); + err = f2fs_get_node_info(sbi, ino, &ni, false); if (err) goto err_out; @@ -1302,8 +1302,8 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long flags; if (cpc->reason & CP_UMOUNT) { - if (le32_to_cpu(ckpt->cp_pack_total_block_count) > - sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) { + if (le32_to_cpu(ckpt->cp_pack_total_block_count) + + NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) { clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); f2fs_notice(sbi, "Disable nat_bits due to no space"); } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 49121a21f749..d0c3aeba5945 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -154,6 +154,7 @@ void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; + cc->valid_nr_cpages = 0; if (!reuse) cc->cluster_idx = NULL_CLUSTER; } @@ -620,7 +621,6 @@ static int f2fs_compress_pages(struct compress_ctx *cc) const struct f2fs_compress_ops *cops = f2fs_cops[fi->i_compress_algorithm]; unsigned int max_len, new_nr_cpages; - struct page **new_cpages; u32 chksum = 0; int i, ret; @@ -635,6 +635,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) max_len = COMPRESS_HEADER_SIZE + cc->clen; cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE); + cc->valid_nr_cpages = cc->nr_cpages; cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages); if (!cc->cpages) { @@ -685,13 +686,6 @@ static int f2fs_compress_pages(struct compress_ctx *cc) new_nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE); - /* Now we're going to cut unnecessary tail pages */ - new_cpages = page_array_alloc(cc->inode, new_nr_cpages); - if (!new_cpages) { - ret = -ENOMEM; - goto out_vunmap_cbuf; - } - /* zero out any unused part of the last page */ memset(&cc->cbuf->cdata[cc->clen], 0, (new_nr_cpages * PAGE_SIZE) - @@ -701,10 +695,8 @@ static int f2fs_compress_pages(struct compress_ctx *cc) vm_unmap_ram(cc->rbuf, cc->cluster_size); for (i = 0; i < cc->nr_cpages; i++) { - if (i < new_nr_cpages) { - new_cpages[i] = cc->cpages[i]; + if (i < new_nr_cpages) continue; - } f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -712,9 +704,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) if (cops->destroy_compress_ctx) cops->destroy_compress_ctx(cc); - page_array_free(cc->inode, cc->cpages, cc->nr_cpages); - cc->cpages = new_cpages; - cc->nr_cpages = new_nr_cpages; + cc->valid_nr_cpages = new_nr_cpages; trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx, cc->clen, ret); @@ -1296,7 +1286,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, psize = (loff_t)(cc->rpages[last_index]->index + 1) << PAGE_SHIFT; - err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto out_put_dnode; @@ -1308,14 +1298,14 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, cic->magic = F2FS_COMPRESSED_PAGE_MAGIC; cic->inode = inode; - atomic_set(&cic->pending_pages, cc->nr_cpages); + atomic_set(&cic->pending_pages, cc->valid_nr_cpages); cic->rpages = page_array_alloc(cc->inode, cc->cluster_size); if (!cic->rpages) goto out_put_cic; cic->nr_rpages = cc->cluster_size; - for (i = 0; i < cc->nr_cpages; i++) { + for (i = 0; i < cc->valid_nr_cpages; i++) { 
f2fs_set_compressed_page(cc->cpages[i], inode, cc->rpages[i + 1]->index, cic); fio.compressed_page = cc->cpages[i]; @@ -1360,7 +1350,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, if (fio.compr_blocks && __is_valid_data_blkaddr(blkaddr)) fio.compr_blocks++; - if (i > cc->nr_cpages) { + if (i > cc->valid_nr_cpages) { if (__is_valid_data_blkaddr(blkaddr)) { f2fs_invalidate_blocks(sbi, blkaddr); f2fs_update_data_blkaddr(&dn, NEW_ADDR); @@ -1385,8 +1375,8 @@ unlock_continue: if (fio.compr_blocks) f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); - f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true); - add_compr_block_stat(inode, cc->nr_cpages); + f2fs_i_compr_blocks_update(inode, cc->valid_nr_cpages, true); + add_compr_block_stat(inode, cc->valid_nr_cpages); set_inode_flag(cc->inode, FI_APPEND_WRITE); if (cc->cluster_idx == 0) @@ -1424,9 +1414,7 @@ out_unlock_op: else f2fs_unlock_op(sbi); out_free: - for (i = 0; i < cc->nr_cpages; i++) { - if (!cc->cpages[i]) - continue; + for (i = 0; i < cc->valid_nr_cpages; i++) { f2fs_compress_free_page(cc->cpages[i]); cc->cpages[i] = NULL; } @@ -1468,25 +1456,38 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, enum iostat_type io_type) { struct address_space *mapping = cc->inode->i_mapping; - int _submitted, compr_blocks, ret; - int i = -1, err = 0; + int _submitted, compr_blocks, ret, i; compr_blocks = f2fs_compressed_blocks(cc); - if (compr_blocks < 0) { - err = compr_blocks; - goto out_err; + + for (i = 0; i < cc->cluster_size; i++) { + if (!cc->rpages[i]) + continue; + + redirty_page_for_writepage(wbc, cc->rpages[i]); + unlock_page(cc->rpages[i]); } + if (compr_blocks < 0) + return compr_blocks; + for (i = 0; i < cc->cluster_size; i++) { if (!cc->rpages[i]) continue; retry_write: + lock_page(cc->rpages[i]); + if (cc->rpages[i]->mapping != mapping) { +continue_unlock: unlock_page(cc->rpages[i]); continue; } - BUG_ON(!PageLocked(cc->rpages[i])); + if (!PageDirty(cc->rpages[i])) + goto continue_unlock; + + if (!clear_page_dirty_for_io(cc->rpages[i])) + goto continue_unlock; ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted, NULL, NULL, wbc, io_type, @@ -1501,26 +1502,15 @@ retry_write: * avoid deadlock caused by cluster update race * from foreground operation. 
*/ - if (IS_NOQUOTA(cc->inode)) { - err = 0; - goto out_err; - } + if (IS_NOQUOTA(cc->inode)) + return 0; ret = 0; cond_resched(); congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); - lock_page(cc->rpages[i]); - - if (!PageDirty(cc->rpages[i])) { - unlock_page(cc->rpages[i]); - continue; - } - - clear_page_dirty_for_io(cc->rpages[i]); goto retry_write; } - err = ret; - goto out_err; + return ret; } *submitted += _submitted; @@ -1529,14 +1519,6 @@ retry_write: f2fs_balance_fs(F2FS_M_SB(mapping), true); return 0; -out_err: - for (++i; i < cc->cluster_size; i++) { - if (!cc->rpages[i]) - continue; - redirty_page_for_writepage(wbc, cc->rpages[i]); - unlock_page(cc->rpages[i]); - } - return err; } int f2fs_write_multi_pages(struct compress_ctx *cc, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aacf5e4dcc57..0a1d236212f8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -21,6 +21,7 @@ #include <linux/cleancache.h> #include <linux/sched/signal.h> #include <linux/fiemap.h> +#include <linux/iomap.h> #include "f2fs.h" #include "node.h" @@ -1354,7 +1355,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - err = f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; @@ -1376,61 +1377,9 @@ alloc: f2fs_invalidate_compress_page(sbi, old_blkaddr); } f2fs_update_data_blkaddr(dn, dn->data_blkaddr); - - /* - * i_size will be updated by direct_IO. Otherwise, we'll get stale - * data from unwritten block via dio_read. - */ return 0; } -int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct f2fs_map_blocks map; - int flag; - int err = 0; - bool direct_io = iocb->ki_flags & IOCB_DIRECT; - - map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); - map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); - if (map.m_len > map.m_lblk) - map.m_len -= map.m_lblk; - else - map.m_len = 0; - - map.m_next_pgofs = NULL; - map.m_next_extent = NULL; - map.m_seg_type = NO_CHECK_TYPE; - map.m_may_create = true; - - if (direct_io) { - map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint); - flag = f2fs_force_buffered_io(inode, iocb, from) ? 
- F2FS_GET_BLOCK_PRE_AIO : - F2FS_GET_BLOCK_PRE_DIO; - goto map_blocks; - } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } - if (f2fs_has_inline_data(inode)) - return err; - - flag = F2FS_GET_BLOCK_PRE_AIO; - -map_blocks: - err = f2fs_map_blocks(inode, &map, 1, flag); - if (map.m_len > 0 && err == -ENOSPC) { - if (!direct_io) - set_inode_flag(inode, FI_NO_PREALLOC); - err = 0; - } - return err; -} - void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) { if (flag == F2FS_GET_BLOCK_PRE_AIO) { @@ -1590,8 +1539,11 @@ next_block: flag != F2FS_GET_BLOCK_DIO); err = __allocate_data_block(&dn, map->m_seg_type); - if (!err) + if (!err) { + if (flag == F2FS_GET_BLOCK_PRE_DIO) + file_need_truncate(inode); set_inode_flag(inode, FI_APPEND_WRITE); + } } if (err) goto sync_out; @@ -1786,50 +1738,6 @@ static inline u64 blks_to_bytes(struct inode *inode, u64 blks) return (blks << inode->i_blkbits); } -static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create, int flag, - pgoff_t *next_pgofs, int seg_type, bool may_write) -{ - struct f2fs_map_blocks map; - int err; - - map.m_lblk = iblock; - map.m_len = bytes_to_blks(inode, bh->b_size); - map.m_next_pgofs = next_pgofs; - map.m_next_extent = NULL; - map.m_seg_type = seg_type; - map.m_may_create = may_write; - - err = f2fs_map_blocks(inode, &map, create, flag); - if (!err) { - map_bh(bh, inode->i_sb, map.m_pblk); - bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = blks_to_bytes(inode, map.m_len); - - if (map.m_multidev_dio) - bh->b_bdev = map.m_bdev; - } - return err; -} - -static int get_data_block_dio_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL, - f2fs_rw_hint_to_seg_type(inode->i_write_hint), - true); -} - -static int get_data_block_dio(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) -{ - return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL, - f2fs_rw_hint_to_seg_type(inode->i_write_hint), - false); -} - static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { @@ -1849,7 +1757,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { f2fs_put_page(page, 1); return err; @@ -1881,7 +1789,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - err = f2fs_get_node_info(sbi, xnid, &ni); + err = f2fs_get_node_info(sbi, xnid, &ni, false); if (err) { f2fs_put_page(page, 1); return err; @@ -2617,6 +2525,11 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + /* The below cases were checked when setting it. 
*/ + if (f2fs_is_pinned_file(inode)) + return false; + if (fio && is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return true; if (f2fs_lfs_mode(sbi)) return true; if (S_ISDIR(inode->i_mode)) @@ -2625,8 +2538,6 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (f2fs_is_atomic_file(inode)) return true; - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - return true; /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) @@ -2738,7 +2649,7 @@ got_it: fio->need_lock = LOCK_REQ; } - err = f2fs_get_node_info(fio->sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio->sbi, dn.nid, &ni, false); if (err) goto out_writepage; @@ -2987,6 +2898,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, .rpages = NULL, .nr_rpages = 0, .cpages = NULL, + .valid_nr_cpages = 0, .rbuf = NULL, .cbuf = NULL, .rlen = PAGE_SIZE * F2FS_I(inode)->i_cluster_size, @@ -3305,7 +3217,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, FS_CP_DATA_IO : FS_DATA_IO); } -static void f2fs_write_failed(struct inode *inode, loff_t to) +void f2fs_write_failed(struct inode *inode, loff_t to) { loff_t i_size = i_size_read(inode); @@ -3339,12 +3251,10 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, int flag; /* - * we already allocated all the blocks, so we don't need to get - * the block addresses when there is no need to fill the page. + * If a whole page is being written and we already preallocated all the + * blocks, then there is no need to get a block address now. */ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && - !is_inode_flag_set(inode, FI_NO_PREALLOC) && - !f2fs_verity_in_progress(inode)) + if (len == PAGE_SIZE && is_inode_flag_set(inode, FI_PREALLOCATED_ALL)) return 0; /* f2fs_lock_op avoids race between write CP and convert_inline_page */ @@ -3595,158 +3505,6 @@ unlock_out: return copied; } -static int check_direct_IO(struct inode *inode, struct iov_iter *iter, - loff_t offset) -{ - unsigned i_blkbits = READ_ONCE(inode->i_blkbits); - unsigned blkbits = i_blkbits; - unsigned blocksize_mask = (1 << blkbits) - 1; - unsigned long align = offset | iov_iter_alignment(iter); - struct block_device *bdev = inode->i_sb->s_bdev; - - if (iov_iter_rw(iter) == READ && offset >= i_size_read(inode)) - return 1; - - if (align & blocksize_mask) { - if (bdev) - blkbits = blksize_bits(bdev_logical_block_size(bdev)); - blocksize_mask = (1 << blkbits) - 1; - if (align & blocksize_mask) - return -EINVAL; - return 1; - } - return 0; -} - -static void f2fs_dio_end_io(struct bio *bio) -{ - struct f2fs_private_dio *dio = bio->bi_private; - - dec_page_count(F2FS_I_SB(dio->inode), - dio->write ? F2FS_DIO_WRITE : F2FS_DIO_READ); - - bio->bi_private = dio->orig_private; - bio->bi_end_io = dio->orig_end_io; - - kfree(dio); - - bio_endio(bio); -} - -static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode, - loff_t file_offset) -{ - struct f2fs_private_dio *dio; - bool write = (bio_op(bio) == REQ_OP_WRITE); - - dio = f2fs_kzalloc(F2FS_I_SB(inode), - sizeof(struct f2fs_private_dio), GFP_NOFS); - if (!dio) - goto out; - - dio->inode = inode; - dio->orig_end_io = bio->bi_end_io; - dio->orig_private = bio->bi_private; - dio->write = write; - - bio->bi_end_io = f2fs_dio_end_io; - bio->bi_private = dio; - - inc_page_count(F2FS_I_SB(inode), - write ? 
F2FS_DIO_WRITE : F2FS_DIO_READ); - - submit_bio(bio); - return; -out: - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); -} - -static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - size_t count = iov_iter_count(iter); - loff_t offset = iocb->ki_pos; - int rw = iov_iter_rw(iter); - int err; - enum rw_hint hint = iocb->ki_hint; - int whint_mode = F2FS_OPTION(sbi).whint_mode; - bool do_opu; - - err = check_direct_IO(inode, iter, offset); - if (err) - return err < 0 ? err : 0; - - if (f2fs_force_buffered_io(inode, iocb, iter)) - return 0; - - do_opu = rw == WRITE && f2fs_lfs_mode(sbi); - - trace_f2fs_direct_IO_enter(inode, offset, count, rw); - - if (rw == WRITE && whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = WRITE_LIFE_NOT_SET; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (!down_read_trylock(&fi->i_gc_rwsem[rw])) { - iocb->ki_hint = hint; - err = -EAGAIN; - goto out; - } - if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) { - up_read(&fi->i_gc_rwsem[rw]); - iocb->ki_hint = hint; - err = -EAGAIN; - goto out; - } - } else { - down_read(&fi->i_gc_rwsem[rw]); - if (do_opu) - down_read(&fi->i_gc_rwsem[READ]); - } - - err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, rw == WRITE ? get_data_block_dio_write : - get_data_block_dio, NULL, f2fs_dio_submit_bio, - rw == WRITE ? DIO_LOCKING | DIO_SKIP_HOLES : - DIO_SKIP_HOLES); - - if (do_opu) - up_read(&fi->i_gc_rwsem[READ]); - - up_read(&fi->i_gc_rwsem[rw]); - - if (rw == WRITE) { - if (whint_mode == WHINT_MODE_OFF) - iocb->ki_hint = hint; - if (err > 0) { - f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, - err); - if (!do_opu) - set_inode_flag(inode, FI_UPDATE_WRITE); - } else if (err == -EIOCBQUEUED) { - f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, - count - iov_iter_count(iter)); - } else if (err < 0) { - f2fs_write_failed(inode, offset + count); - } - } else { - if (err > 0) - f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, err); - else if (err == -EIOCBQUEUED) - f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_READ_IO, - count - iov_iter_count(iter)); - } - -out: - trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); - - return err; -} - void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length) { @@ -3770,12 +3528,9 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, clear_page_private_gcing(page); - if (test_opt(sbi, COMPRESS_CACHE)) { - if (f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(sbi, inode->i_ino); - if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) - clear_page_private_data(page); - } + if (test_opt(sbi, COMPRESS_CACHE) && + inode->i_ino == F2FS_COMPRESS_INO(sbi)) + clear_page_private_data(page); if (page_private_atomic(page)) return f2fs_drop_inmem_page(inode, page); @@ -3795,12 +3550,9 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 0; if (test_opt(F2FS_P_SB(page), COMPRESS_CACHE)) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct inode *inode = page->mapping->host; - if (f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(sbi, inode->i_ino); - if (inode->i_ino == F2FS_COMPRESS_INO(sbi)) + if (inode->i_ino == F2FS_COMPRESS_INO(F2FS_I_SB(inode))) clear_page_private_data(page); } @@ -4202,7 +3954,7 @@ const struct address_space_operations f2fs_dblock_aops = { .set_page_dirty = f2fs_set_data_page_dirty, 
.invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, - .direct_IO = f2fs_direct_IO, + .direct_IO = noop_direct_IO, .bmap = f2fs_bmap, .swap_activate = f2fs_swap_activate, .swap_deactivate = f2fs_swap_deactivate, @@ -4282,3 +4034,58 @@ void f2fs_destroy_bio_entry_cache(void) { kmem_cache_destroy(bio_entry_slab); } + +static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct f2fs_map_blocks map = {}; + pgoff_t next_pgofs = 0; + int err; + + map.m_lblk = bytes_to_blks(inode, offset); + map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1; + map.m_next_pgofs = &next_pgofs; + map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + if (flags & IOMAP_WRITE) + map.m_may_create = true; + + err = f2fs_map_blocks(inode, &map, flags & IOMAP_WRITE, + F2FS_GET_BLOCK_DIO); + if (err) + return err; + + iomap->offset = blks_to_bytes(inode, map.m_lblk); + + if (map.m_flags & (F2FS_MAP_MAPPED | F2FS_MAP_UNWRITTEN)) { + iomap->length = blks_to_bytes(inode, map.m_len); + if (map.m_flags & F2FS_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->flags |= IOMAP_F_MERGED; + } else { + iomap->type = IOMAP_UNWRITTEN; + } + if (WARN_ON_ONCE(!__is_valid_data_blkaddr(map.m_pblk))) + return -EINVAL; + + iomap->bdev = map.m_bdev; + iomap->addr = blks_to_bytes(inode, map.m_pblk); + } else { + iomap->length = blks_to_bytes(inode, next_pgofs) - + iomap->offset; + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + } + + if (map.m_flags & F2FS_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + if ((inode->i_state & I_DIRTY_DATASYNC) || + offset + length > i_size_read(inode)) + iomap->flags |= IOMAP_F_DIRTY; + + return 0; +} + +const struct iomap_ops f2fs_iomap_ops = { + .iomap_begin = f2fs_iomap_begin, +}; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0d603187171..eb22fa91c2b2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -58,6 +58,7 @@ enum { FAULT_WRITE_IO, FAULT_SLAB_ALLOC, FAULT_DQUOT_INIT, + FAULT_LOCK_OP, FAULT_MAX, }; @@ -656,6 +657,7 @@ enum { #define FADVISE_KEEP_SIZE_BIT 0x10 #define FADVISE_HOT_BIT 0x20 #define FADVISE_VERITY_BIT 0x40 +#define FADVISE_TRUNC_BIT 0x80 #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) @@ -683,6 +685,10 @@ enum { #define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) #define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) +#define file_should_truncate(inode) is_file(inode, FADVISE_TRUNC_BIT) +#define file_need_truncate(inode) set_file(inode, FADVISE_TRUNC_BIT) +#define file_dont_truncate(inode) clear_file(inode, FADVISE_TRUNC_BIT) + #define DEF_DIR_LEVEL 0 enum { @@ -717,7 +723,7 @@ enum { FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ - FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */ FI_HOT_DATA, /* indicate file is hot */ FI_EXTRA_ATTR, /* indicate file has extra attribute */ FI_PROJ_INHERIT, /* indicate file inherits projectid */ @@ -1020,6 +1026,7 @@ struct f2fs_sm_info { unsigned int segment_count; /* total # of segments */ unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ + unsigned int additional_reserved_segments;/* reserved segs for IO align feature */ unsigned int ovp_segments; /* # of overprovision segments */ 
/* a threshold to reclaim prefree segments */ @@ -1488,6 +1495,7 @@ struct compress_ctx { unsigned int nr_rpages; /* total page number in rpages */ struct page **cpages; /* pages store compressed data in cluster */ unsigned int nr_cpages; /* total page number in cpages */ + unsigned int valid_nr_cpages; /* valid page number in cpages */ void *rbuf; /* virtual mapped address on rpages */ struct compress_data *cbuf; /* virtual mapped address on cpages */ size_t rlen; /* valid data length in rbuf */ @@ -1679,6 +1687,9 @@ struct f2fs_sb_info { unsigned int cur_victim_sec; /* current victim section num */ unsigned int gc_mode; /* current GC state */ unsigned int next_victim_seg[2]; /* next segment in victim section */ + spinlock_t gc_urgent_high_lock; + bool gc_urgent_high_limited; /* indicates having limited trial count */ + unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */ /* for skip statistic */ unsigned int atomic_files; /* # of opened atomic file */ @@ -1803,13 +1814,6 @@ struct f2fs_sb_info { #endif }; -struct f2fs_private_dio { - struct inode *inode; - void *orig_private; - bio_end_io_t *orig_end_io; - bool write; -}; - #ifdef CONFIG_F2FS_FAULT_INJECTION #define f2fs_show_injection_info(sbi, type) \ printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n", \ @@ -2095,6 +2099,10 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) { + if (time_to_inject(sbi, FAULT_LOCK_OP)) { + f2fs_show_injection_info(sbi, FAULT_LOCK_OP); + return 0; + } return down_read_trylock(&sbi->cp_rwsem); } @@ -2200,6 +2208,11 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, true)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + avail_user_block_count -= sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (avail_user_block_count > sbi->unusable_block_count) avail_user_block_count -= sbi->unusable_block_count; @@ -2446,6 +2459,11 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, if (!__allow_reserved_blocks(sbi, inode, false)) valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks; + + if (F2FS_IO_ALIGNED(sbi)) + valid_block_count += sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments; + user_block_count = sbi->user_block_count; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) user_block_count -= sbi->unusable_block_count; @@ -3118,12 +3136,16 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { + if (is_file(inode, type)) + return; F2FS_I(inode)->i_advise |= type; f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { + if (!is_file(inode, type)) + return; F2FS_I(inode)->i_advise &= ~type; f2fs_mark_inode_dirty_sync(inode, true); } @@ -3408,7 +3430,7 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, - struct node_info *ni); + struct node_info *ni, bool checkpoint_context); pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int 
f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); @@ -3616,7 +3638,6 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); -int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, int op_flags, bool for_write); @@ -3639,6 +3660,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, struct writeback_control *wbc, enum iostat_type io_type, int compr_blocks, bool allow_balance); +void f2fs_write_failed(struct inode *inode, loff_t to); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -3652,6 +3674,7 @@ int f2fs_init_post_read_processing(void); void f2fs_destroy_post_read_processing(void); int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); +extern const struct iomap_ops f2fs_iomap_ops; /* * gc.c diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 92ec2699bc85..3c98ef6af97d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -24,6 +24,7 @@ #include <linux/sched/signal.h> #include <linux/fileattr.h> #include <linux/fadvise.h> +#include <linux/iomap.h> #include "f2fs.h" #include "node.h" @@ -1232,7 +1233,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, if (ret) return ret; - ret = f2fs_get_node_info(sbi, dn.nid, &ni); + ret = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (ret) { f2fs_put_dnode(&dn); return ret; @@ -1687,6 +1688,7 @@ next_alloc: map.m_seg_type = CURSEG_COLD_DATA_PINNED; err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + file_dont_truncate(inode); up_write(&sbi->pin_sem); @@ -1748,7 +1750,11 @@ static long f2fs_fallocate(struct file *file, int mode, (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; - if (f2fs_compressed_file(inode) && + /* + * Pinned file should not support partial truncation since the block + * can be used by applications. + */ + if ((f2fs_compressed_file(inode) || f2fs_is_pinned_file(inode)) && (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE))) return -EOPNOTSUPP; @@ -3143,17 +3149,17 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) inode_lock(inode); - if (f2fs_should_update_outplace(inode, NULL)) { - ret = -EINVAL; - goto out; - } - if (!pin) { clear_inode_flag(inode, FI_PIN_FILE); f2fs_i_gc_failures_write(inode, 0); goto done; } + if (f2fs_should_update_outplace(inode, NULL)) { + ret = -EINVAL; + goto out; + } + if (f2fs_pin_file_control(inode, false)) { ret = -EAGAIN; goto out; } @@ -4218,27 +4224,385 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return __f2fs_ioctl(filp, cmd, arg); } -static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +/* + * Return %true if the given read or write request should use direct I/O, or + * %false if it should use buffered I/O.
+ */ +static bool f2fs_should_use_dio(struct inode *inode, struct kiocb *iocb, + struct iov_iter *iter) +{ + unsigned int align; + + if (!(iocb->ki_flags & IOCB_DIRECT)) + return false; + + if (f2fs_force_buffered_io(inode, iocb, iter)) + return false; + + /* + * Direct I/O not aligned to the disk's logical_block_size will be + * attempted, but will fail with -EINVAL. + * + * f2fs additionally requires that direct I/O be aligned to the + * filesystem block size, which is often a stricter requirement. + * However, f2fs traditionally falls back to buffered I/O on requests + * that are logical_block_size-aligned but not fs-block aligned. + * + * The below logic implements this behavior. + */ + align = iocb->ki_pos | iov_iter_alignment(iter); + if (!IS_ALIGNED(align, i_blocksize(inode)) && + IS_ALIGNED(align, bdev_logical_block_size(inode->i_sb->s_bdev))) + return false; + + return true; +} + +static int f2fs_dio_read_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_READ); + if (error) + return error; + f2fs_update_iostat(sbi, APP_DIRECT_READ_IO, size); + return 0; +} + +static const struct iomap_dio_ops f2fs_iomap_dio_read_ops = { + .end_io = f2fs_dio_read_end_io, +}; + +static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - int ret; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(to); + struct iomap_dio *dio; + ssize_t ret; + + if (count == 0) + return 0; /* skip atime update */ + + trace_f2fs_direct_IO_enter(inode, iocb, count, READ); + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!down_read_trylock(&fi->i_gc_rwsem[READ])) { + ret = -EAGAIN; + goto out; + } + } else { + down_read(&fi->i_gc_rwsem[READ]); + } + + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_READ counter will be decremented correctly in all cases. 
+ */ + inc_page_count(sbi, F2FS_DIO_READ); + dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops, + &f2fs_iomap_dio_read_ops, 0, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_READ); + } else { + ret = iomap_dio_complete(dio); + } + + up_read(&fi->i_gc_rwsem[READ]); + + file_accessed(file); +out: + trace_f2fs_direct_IO_exit(inode, pos, count, READ, ret); + return ret; +} + +static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; if (!f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - ret = generic_file_read_iter(iocb, iter); + if (f2fs_should_use_dio(inode, iocb, to)) + return f2fs_dio_read_iter(iocb, to); + ret = filemap_read(iocb, to, 0); if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_READ_IO, ret); + f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_READ_IO, ret); + return ret; +} + +static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t count; + int err; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + return -EPERM; + + count = generic_write_checks(iocb, from); + if (count <= 0) + return count; + + err = file_modified(file); + if (err) + return err; + return count; +} + +/* + * Preallocate blocks for a write request, if it is possible and helpful to do + * so. Returns a positive number if blocks may have been preallocated, 0 if no + * blocks were preallocated, or a negative errno value if something went + * seriously wrong. Also sets FI_PREALLOCATED_ALL on the inode if *all* the + * requested blocks (not just some of them) have been allocated. + */ +static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, + bool dio) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const loff_t pos = iocb->ki_pos; + const size_t count = iov_iter_count(iter); + struct f2fs_map_blocks map = {}; + int flag; + int ret; + + /* If it will be an out-of-place direct write, don't bother. */ + if (dio && f2fs_lfs_mode(sbi)) + return 0; + /* + * Don't preallocate holes aligned to DIO_SKIP_HOLES which turns into + * buffered IO, if DIO meets any holes. + */ + if (dio && i_size_read(inode) && + (F2FS_BYTES_TO_BLK(pos) < F2FS_BLK_ALIGN(i_size_read(inode)))) + return 0; + + /* No-wait I/O can't allocate blocks. */ + if (iocb->ki_flags & IOCB_NOWAIT) + return 0; + + /* If it will be a short write, don't bother. */ + if (fault_in_iov_iter_readable(iter, count)) + return 0; + + if (f2fs_has_inline_data(inode)) { + /* If the data will fit inline, don't bother. */ + if (pos + count <= MAX_INLINE_DATA(inode)) + return 0; + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + + /* Do not preallocate blocks that will be written partially in 4KB. */ + map.m_lblk = F2FS_BLK_ALIGN(pos); + map.m_len = F2FS_BYTES_TO_BLK(pos + count); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; + map.m_may_create = true; + if (dio) { + map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint); + flag = F2FS_GET_BLOCK_PRE_DIO; + } else { + map.m_seg_type = NO_CHECK_TYPE; + flag = F2FS_GET_BLOCK_PRE_AIO; + } + + ret = f2fs_map_blocks(inode, &map, 1, flag); + /* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. 
*/ + if (ret < 0 && !((ret == -ENOSPC || ret == -EDQUOT) && map.m_len > 0)) + return ret; + if (ret == 0) + set_inode_flag(inode, FI_PREALLOCATED_ALL); + return map.m_len; +} + +static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + ssize_t ret; + if (iocb->ki_flags & IOCB_NOWAIT) + return -EOPNOTSUPP; + + current->backing_dev_info = inode_to_bdi(inode); + ret = generic_perform_write(file, from, iocb->ki_pos); + current->backing_dev_info = NULL; + + if (ret > 0) { + iocb->ki_pos += ret; + f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_IO, ret); + } return ret; } -static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, + unsigned int flags) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(iocb->ki_filp)); + + dec_page_count(sbi, F2FS_DIO_WRITE); + if (error) + return error; + f2fs_update_iostat(sbi, APP_DIRECT_IO, size); + return 0; +} + +static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = { + .end_io = f2fs_dio_write_end_io, +}; + +static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from, + bool *may_need_sync) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + const bool do_opu = f2fs_lfs_mode(sbi); + const int whint_mode = F2FS_OPTION(sbi).whint_mode; + const loff_t pos = iocb->ki_pos; + const ssize_t count = iov_iter_count(from); + const enum rw_hint hint = iocb->ki_hint; + unsigned int dio_flags; + struct iomap_dio *dio; + ssize_t ret; + + trace_f2fs_direct_IO_enter(inode, iocb, count, WRITE); + + if (iocb->ki_flags & IOCB_NOWAIT) { + /* f2fs_convert_inline_inode() and block allocation can block */ + if (f2fs_has_inline_data(inode) || + !f2fs_overwrite_io(inode, pos, count)) { + ret = -EAGAIN; + goto out; + } + + if (!down_read_trylock(&fi->i_gc_rwsem[WRITE])) { + ret = -EAGAIN; + goto out; + } + if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) { + up_read(&fi->i_gc_rwsem[WRITE]); + ret = -EAGAIN; + goto out; + } + } else { + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + down_read(&fi->i_gc_rwsem[WRITE]); + if (do_opu) + down_read(&fi->i_gc_rwsem[READ]); + } + if (whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = WRITE_LIFE_NOT_SET; + + /* + * We have to use __iomap_dio_rw() and iomap_dio_complete() instead of + * the higher-level function iomap_dio_rw() in order to ensure that the + * F2FS_DIO_WRITE counter will be decremented correctly in all cases. 
+ */ + inc_page_count(sbi, F2FS_DIO_WRITE); + dio_flags = 0; + if (pos + count > inode->i_size) + dio_flags |= IOMAP_DIO_FORCE_WAIT; + dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops, + &f2fs_iomap_dio_write_ops, dio_flags, 0); + if (IS_ERR_OR_NULL(dio)) { + ret = PTR_ERR_OR_ZERO(dio); + if (ret == -ENOTBLK) + ret = 0; + if (ret != -EIOCBQUEUED) + dec_page_count(sbi, F2FS_DIO_WRITE); + } else { + ret = iomap_dio_complete(dio); + } + + if (whint_mode == WHINT_MODE_OFF) + iocb->ki_hint = hint; + if (do_opu) + up_read(&fi->i_gc_rwsem[READ]); + up_read(&fi->i_gc_rwsem[WRITE]); + + if (ret < 0) + goto out; + if (pos + ret > inode->i_size) + f2fs_i_size_write(inode, pos + ret); + if (!do_opu) + set_inode_flag(inode, FI_UPDATE_WRITE); + + if (iov_iter_count(from)) { + ssize_t ret2; + loff_t bufio_start_pos = iocb->ki_pos; + + /* + * The direct write was partial, so we need to fall back to a + * buffered write for the remainder. + */ + + ret2 = f2fs_buffered_write_iter(iocb, from); + if (iov_iter_count(from)) + f2fs_write_failed(inode, iocb->ki_pos); + if (ret2 < 0) + goto out; + + /* + * Ensure that the pagecache pages are written to disk and + * invalidated to preserve the expected O_DIRECT semantics. + */ + if (ret2 > 0) { + loff_t bufio_end_pos = bufio_start_pos + ret2 - 1; + + ret += ret2; + + ret2 = filemap_write_and_wait_range(file->f_mapping, + bufio_start_pos, + bufio_end_pos); + if (ret2 < 0) + goto out; + invalidate_mapping_pages(file->f_mapping, + bufio_start_pos >> PAGE_SHIFT, + bufio_end_pos >> PAGE_SHIFT); + } + } else { + /* iomap_dio_rw() already handled the generic_write_sync(). */ + *may_need_sync = false; + } +out: + trace_f2fs_direct_IO_exit(inode, pos, count, WRITE, ret); + return ret; +} + +static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + const loff_t orig_pos = iocb->ki_pos; + const size_t orig_count = iov_iter_count(from); + loff_t target_size; + bool dio; + bool may_need_sync = true; + int preallocated; ssize_t ret; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { @@ -4260,91 +4624,42 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); } - if (unlikely(IS_IMMUTABLE(inode))) { - ret = -EPERM; - goto unlock; - } - - if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) { - ret = -EPERM; - goto unlock; - } - - ret = generic_write_checks(iocb, from); - if (ret > 0) { - bool preallocated = false; - size_t target_size = 0; - int err; - - if (fault_in_iov_iter_readable(from, iov_iter_count(from))) - set_inode_flag(inode, FI_NO_PREALLOC); - - if ((iocb->ki_flags & IOCB_NOWAIT)) { - if (!f2fs_overwrite_io(inode, iocb->ki_pos, - iov_iter_count(from)) || - f2fs_has_inline_data(inode) || - f2fs_force_buffered_io(inode, iocb, from)) { - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - ret = -EAGAIN; - goto out; - } - goto write; - } - - if (is_inode_flag_set(inode, FI_NO_PREALLOC)) - goto write; - - if (iocb->ki_flags & IOCB_DIRECT) { - /* - * Convert inline data for Direct I/O before entering - * f2fs_direct_IO(). - */ - err = f2fs_convert_inline_inode(inode); - if (err) - goto out_err; - /* - * If force_buffere_io() is true, we have to allocate - * blocks all the time, since f2fs_direct_IO will fall - * back to buffered IO. 
- */ - if (!f2fs_force_buffered_io(inode, iocb, from) && - f2fs_lfs_mode(F2FS_I_SB(inode))) - goto write; - } - preallocated = true; - target_size = iocb->ki_pos + iov_iter_count(from); + ret = f2fs_write_checks(iocb, from); + if (ret <= 0) + goto out_unlock; - err = f2fs_preallocate_blocks(iocb, from); - if (err) { -out_err: - clear_inode_flag(inode, FI_NO_PREALLOC); - inode_unlock(inode); - ret = err; - goto out; - } -write: - ret = __generic_file_write_iter(iocb, from); - clear_inode_flag(inode, FI_NO_PREALLOC); + /* Determine whether we will do a direct write or a buffered write. */ + dio = f2fs_should_use_dio(inode, iocb, from); - /* if we couldn't write data, we should deallocate blocks. */ - if (preallocated && i_size_read(inode) < target_size) { - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - filemap_invalidate_lock(inode->i_mapping); - f2fs_truncate(inode); - filemap_invalidate_unlock(inode->i_mapping); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - } + /* Possibly preallocate the blocks for the write. */ + target_size = iocb->ki_pos + iov_iter_count(from); + preallocated = f2fs_preallocate_blocks(iocb, from, dio); + if (preallocated < 0) + ret = preallocated; + else + /* Do the actual write. */ + ret = dio ? + f2fs_dio_write_iter(iocb, from, &may_need_sync): + f2fs_buffered_write_iter(iocb, from); - if (ret > 0) - f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); + /* Don't leave any preallocated blocks around past i_size. */ + if (preallocated && i_size_read(inode) < target_size) { + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + filemap_invalidate_lock(inode->i_mapping); + if (!f2fs_truncate(inode)) + file_dont_truncate(inode); + filemap_invalidate_unlock(inode->i_mapping); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } else { + file_dont_truncate(inode); } -unlock: + + clear_inode_flag(inode, FI_PREALLOCATED_ALL); +out_unlock: inode_unlock(inode); out: - trace_f2fs_file_write_iter(inode, iocb->ki_pos, - iov_iter_count(from), ret); - if (ret > 0) + trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret); + if (ret > 0 && may_need_sync) ret = generic_write_sync(iocb, ret); return ret; } @@ -4352,12 +4667,12 @@ out: static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) { - struct inode *inode; struct address_space *mapping; struct backing_dev_info *bdi; + struct inode *inode = file_inode(filp); + int err; if (advice == POSIX_FADV_SEQUENTIAL) { - inode = file_inode(filp); if (S_ISFIFO(inode->i_mode)) return -ESPIPE; @@ -4374,7 +4689,13 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, return 0; } - return generic_fadvise(filp, offset, len, advice); + err = generic_fadvise(filp, offset, len, advice); + if (!err && advice == POSIX_FADV_DONTNEED && + test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && + f2fs_compressed_file(inode)) + f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + + return err; } #ifdef CONFIG_COMPAT diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 374bbb5294d9..ee308a8de432 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -92,6 +92,18 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. 
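One user-visible effect of the fadvise hook above: with the compress_cache mount option, POSIX_FADV_DONTNEED on a compressed file now also drops f2fs's compressed-page cache for that inode. The path is reached by an ordinary fadvise call; a minimal userspace sketch:

#include <fcntl.h>      /* needs _POSIX_C_SOURCE >= 200112L */

/* Ask the kernel to drop cached data for the whole file (len == 0 means
 * "to end of file"); on f2fs mounted with compress_cache, the hunk above
 * additionally invalidates the compressed-page cache for this inode. */
static int drop_file_cache(int fd)
{
        return posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
}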
*/ if (sbi->gc_mode == GC_URGENT_HIGH) { + spin_lock(&sbi->gc_urgent_high_lock); + if (sbi->gc_urgent_high_limited) { + if (!sbi->gc_urgent_high_remaining) { + sbi->gc_urgent_high_limited = false; + spin_unlock(&sbi->gc_urgent_high_lock); + sbi->gc_mode = GC_NORMAL; + continue; + } + sbi->gc_urgent_high_remaining--; + } + spin_unlock(&sbi->gc_urgent_high_lock); + wait_ms = gc_th->urgent_sleep_time; down_write(&sbi->gc_lock); goto do_gc; @@ -947,7 +959,7 @@ next_step: continue; } - if (f2fs_get_node_info(sbi, nid, &ni)) { + if (f2fs_get_node_info(sbi, nid, &ni, false)) { f2fs_put_page(node_page, 1); continue; } @@ -1015,7 +1027,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (IS_ERR(node_page)) return false; - if (f2fs_get_node_info(sbi, nid, dni)) { + if (f2fs_get_node_info(sbi, nid, dni, false)) { f2fs_put_page(node_page, 1); return false; } @@ -1026,6 +1038,9 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, set_sbi_flag(sbi, SBI_NEED_FSCK); } + if (f2fs_check_nid_range(sbi, dni->ino)) + return false; + *nofs = ofs_of_node(node_page); source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); @@ -1039,7 +1054,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) { f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u", blkaddr, source_blkaddr, segno); - f2fs_bug_on(sbi, 1); + set_sbi_flag(sbi, SBI_NEED_FSCK); } } #endif @@ -1206,7 +1221,7 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); - err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false); if (err) goto put_out; @@ -1456,7 +1471,8 @@ next_step: if (phase == 3) { inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode) || is_bad_inode(inode)) + if (IS_ERR(inode) || is_bad_inode(inode) || + special_file(inode->i_mode)) continue; if (!down_write_trylock( diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index ea08f0dfa1bd..4b5cefa3f90c 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -131,7 +131,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; - err = f2fs_get_node_info(fio.sbi, dn->nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn->nid, &ni, false); if (err) { f2fs_truncate_data_blocks_range(dn, 1); f2fs_put_dnode(dn); @@ -786,7 +786,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, ilen = start + len; ilen -= start; - err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni, false); if (err) goto out; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 4c11254a07d4..0ec8e32a00b4 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -516,6 +516,11 @@ make_now: } else if (ino == F2FS_COMPRESS_INO(sbi)) { #ifdef CONFIG_F2FS_FS_COMPRESSION inode->i_mapping->a_ops = &f2fs_compress_aops; + /* + * generic_error_remove_page only truncates pages of regular + * inode + */ + inode->i_mode |= S_IFREG; #endif mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); @@ -544,6 +549,14 @@ make_now: goto bad_inode; } f2fs_set_inode_flags(inode); + + if (file_should_truncate(inode)) { + ret = f2fs_truncate(inode); + if (ret) + goto bad_inode; + file_dont_truncate(inode); + } + unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; @@ -738,7 +751,8 @@ void f2fs_evict_inode(struct inode *inode) 
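The GC_URGENT_HIGH hunk at the top of this gc.c change cooperates with the gc_urgent_high_remaining sysfs knob added in the sysfs.c hunk further below: while the limiter is armed, each urgent round decrements the budget under gc_urgent_high_lock, and gc_mode falls back to GC_NORMAL once it reaches zero. A userspace sketch for arming it follows; the device name is an assumption, since the sysfs directory is per-volume.

#include <stdio.h>

/* Allow at most 'rounds' urgent-GC rounds before f2fs drops back to
 * normal GC mode; writing 0 disarms the limiter. */
static int limit_urgent_gc(const char *dev, unsigned int rounds)
{
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/fs/f2fs/%s/gc_urgent_high_remaining", dev);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%u\n", rounds);
        return fclose(f);
}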
trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); - if (test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) + if ((inode->i_nlink || is_bad_inode(inode)) && + test_opt(sbi, COMPRESS_CACHE) && f2fs_compressed_file(inode)) f2fs_invalidate_compress_pages(sbi, inode->i_ino); if (inode->i_ino == F2FS_NODE_INO(sbi) || @@ -868,7 +882,7 @@ void f2fs_handle_failed_inode(struct inode *inode) * so we can prevent losing this orphan when encountering checkpoint * and following sudden power-off. */ - err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "May lose orphan inode, run fsck to fix."); diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c index cdcf54ae0db8..be599f31d3c4 100644 --- a/fs/f2fs/iostat.c +++ b/fs/f2fs/iostat.c @@ -92,7 +92,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; - spin_lock_irq(&sbi->iostat_lat_lock); + spin_lock_bh(&sbi->iostat_lat_lock); for (idx = 0; idx < MAX_IO_TYPE; idx++) { for (io = 0; io < NR_PAGE_TYPE; io++) { cnt = io_lat->bio_cnt[idx][io]; @@ -106,7 +106,7 @@ static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) io_lat->bio_cnt[idx][io] = 0; } } - spin_unlock_irq(&sbi->iostat_lat_lock); + spin_unlock_bh(&sbi->iostat_lat_lock); trace_f2fs_iostat_latency(sbi, iostat_lat); } @@ -120,9 +120,9 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) return; /* Need double check under the lock */ - spin_lock(&sbi->iostat_lock); + spin_lock_bh(&sbi->iostat_lock); if (time_is_after_jiffies(sbi->iostat_next_period)) { - spin_unlock(&sbi->iostat_lock); + spin_unlock_bh(&sbi->iostat_lock); return; } sbi->iostat_next_period = jiffies + @@ -133,7 +133,7 @@ static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) sbi->prev_rw_iostat[i]; sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; } - spin_unlock(&sbi->iostat_lock); + spin_unlock_bh(&sbi->iostat_lock); trace_f2fs_iostat(sbi, iostat_diff); @@ -145,16 +145,16 @@ void f2fs_reset_iostat(struct f2fs_sb_info *sbi) struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int i; - spin_lock(&sbi->iostat_lock); + spin_lock_bh(&sbi->iostat_lock); for (i = 0; i < NR_IO_TYPE; i++) { sbi->rw_iostat[i] = 0; sbi->prev_rw_iostat[i] = 0; } - spin_unlock(&sbi->iostat_lock); + spin_unlock_bh(&sbi->iostat_lock); - spin_lock_irq(&sbi->iostat_lat_lock); + spin_lock_bh(&sbi->iostat_lat_lock); memset(io_lat, 0, sizeof(struct iostat_lat_info)); - spin_unlock_irq(&sbi->iostat_lat_lock); + spin_unlock_bh(&sbi->iostat_lat_lock); } void f2fs_update_iostat(struct f2fs_sb_info *sbi, @@ -163,19 +163,16 @@ void f2fs_update_iostat(struct f2fs_sb_info *sbi, if (!sbi->iostat_enable) return; - spin_lock(&sbi->iostat_lock); + spin_lock_bh(&sbi->iostat_lock); sbi->rw_iostat[type] += io_bytes; - if (type == APP_WRITE_IO || type == APP_DIRECT_IO) - sbi->rw_iostat[APP_BUFFERED_IO] = - sbi->rw_iostat[APP_WRITE_IO] - - sbi->rw_iostat[APP_DIRECT_IO]; + if (type == APP_BUFFERED_IO || type == APP_DIRECT_IO) + sbi->rw_iostat[APP_WRITE_IO] += io_bytes; - if (type == APP_READ_IO || type == APP_DIRECT_READ_IO) - sbi->rw_iostat[APP_BUFFERED_READ_IO] = - sbi->rw_iostat[APP_READ_IO] - - sbi->rw_iostat[APP_DIRECT_READ_IO]; - spin_unlock(&sbi->iostat_lock); + if (type == APP_BUFFERED_READ_IO || type == APP_DIRECT_READ_IO) + sbi->rw_iostat[APP_READ_IO] += io_bytes; +
+ spin_unlock_bh(&sbi->iostat_lock); f2fs_record_iostat(sbi); } @@ -185,7 +182,6 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, { unsigned long ts_diff; unsigned int iotype = iostat_ctx->type; - unsigned long flags; struct f2fs_sb_info *sbi = iostat_ctx->sbi; struct iostat_lat_info *io_lat = sbi->iostat_io_lat; int idx; @@ -206,12 +202,12 @@ static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, idx = WRITE_ASYNC_IO; } - spin_lock_irqsave(&sbi->iostat_lat_lock, flags); + spin_lock_bh(&sbi->iostat_lat_lock); io_lat->sum_lat[idx][iotype] += ts_diff; io_lat->bio_cnt[idx][iotype]++; if (ts_diff > io_lat->peak_lat[idx][iotype]) io_lat->peak_lat[idx][iotype] = ts_diff; - spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); + spin_unlock_bh(&sbi->iostat_lat_lock); } void iostat_update_and_unbind_ctx(struct bio *bio, int rw) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 219506ca9a97..50b2874e758c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -430,6 +430,10 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *new, *e; + /* Let's mitigate lock contention of nat_tree_lock during checkpoint */ + if (rwsem_is_locked(&sbi->cp_global_sem)) + return; + new = __alloc_nat_entry(sbi, nid, false); if (!new) return; @@ -539,7 +543,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) } int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, - struct node_info *ni) + struct node_info *ni, bool checkpoint_context) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -572,9 +576,10 @@ retry: * nat_tree_lock. Therefore, we should retry, if we failed to grab here * while not bothering checkpoint. 
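The retry comment above encodes the locking rule behind the new checkpoint_context argument: a caller that is not serving checkpoint must not sleep on curseg->journal_rwsem while holding nat_tree_lock when checkpoint could be queued behind it, so it trylocks (and, with this change, also backs off when nat_tree_lock itself is contended), dropping everything and retrying. A generic sketch of that back-off shape, with hypothetical lock names rather than the f2fs ones:

#include <linux/rwsem.h>

/* Back-off pattern: never sleep on 'inner' while holding 'outer' when a
 * higher-priority writer may be queued behind them; drop both and retry
 * so the writer can make progress. */
static void read_both_locks(struct rw_semaphore *outer,
                            struct rw_semaphore *inner, bool writer_active)
{
retry:
        down_read(outer);
        if (!writer_active) {
                down_read(inner);               /* safe to block here */
        } else if (!down_read_trylock(inner)) {
                up_read(outer);                 /* give way, then retry */
                goto retry;
        }

        /* ... read-side critical section ... */

        up_read(inner);
        up_read(outer);
}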
*/ - if (!rwsem_is_locked(&sbi->cp_global_sem)) { + if (!rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) { down_read(&curseg->journal_rwsem); - } else if (!down_read_trylock(&curseg->journal_rwsem)) { + } else if (rwsem_is_contended(&nm_i->nat_tree_lock) || + !down_read_trylock(&curseg->journal_rwsem)) { up_read(&nm_i->nat_tree_lock); goto retry; } @@ -887,7 +892,7 @@ static int truncate_node(struct dnode_of_data *dn) int err; pgoff_t index; - err = f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni, false); if (err) return err; @@ -1286,7 +1291,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; #ifdef CONFIG_F2FS_CHECK_FS - err = f2fs_get_node_info(sbi, dn->nid, &new_ni); + err = f2fs_get_node_info(sbi, dn->nid, &new_ni, false); if (err) { dec_valid_node_count(sbi, dn->inode, !ofs); goto fail; @@ -1348,7 +1353,7 @@ static int read_node_page(struct page *page, int op_flags) return LOCKED_PAGE; } - err = f2fs_get_node_info(sbi, page->index, &ni); + err = f2fs_get_node_info(sbi, page->index, &ni, false); if (err) return err; @@ -1600,7 +1605,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); - if (f2fs_get_node_info(sbi, nid, &ni)) + if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) goto redirty_out; if (wbc->for_reclaim) { @@ -2701,7 +2706,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page) goto recover_xnid; /* 1: invalidate the previous xattr nid */ - err = f2fs_get_node_info(sbi, prev_xnid, &ni); + err = f2fs_get_node_info(sbi, prev_xnid, &ni, false); if (err) return err; @@ -2741,7 +2746,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) struct page *ipage; int err; - err = f2fs_get_node_info(sbi, ino, &old_ni); + err = f2fs_get_node_info(sbi, ino, &old_ni, false); if (err) return err; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d1664a0567ef..9683c80ff8c2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -596,7 +596,7 @@ retry_dn: f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); - err = f2fs_get_node_info(sbi, dn.nid, &ni); + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) goto err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 40fdb4a8daeb..575d3dc418d0 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -253,7 +253,7 @@ retry: goto next; } - err = f2fs_get_node_info(sbi, dn.nid, &ni); + err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) { f2fs_put_dnode(&dn); return err; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 46fde9f3f28e..0291cd55cf09 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -538,7 +538,8 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi) static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { - return SM_I(sbi)->reserved_segments; + return SM_I(sbi)->reserved_segments + + SM_I(sbi)->additional_reserved_segments; } static inline unsigned int free_sections(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 15f12ece0ac6..76e6a3df9aba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -59,6 +59,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_WRITE_IO] = "write IO error", [FAULT_SLAB_ALLOC] = "slab alloc", [FAULT_DQUOT_INIT] = "dquot initialize", + [FAULT_LOCK_OP] = "lock_op", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, @@ -321,6 +322,46 @@ static 
inline void limit_reserve_root(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).s_resgid)); } +static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi) +{ + unsigned int sec_blks = sbi->blocks_per_seg * sbi->segs_per_sec; + unsigned int avg_vblocks; + unsigned int wanted_reserved_segments; + block_t avail_user_block_count; + + if (!F2FS_IO_ALIGNED(sbi)) + return 0; + + /* average valid block count in section in worst case */ + avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi); + + /* + * we need enough free space when migrating one section in worst case + */ + wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) * + reserved_segments(sbi); + wanted_reserved_segments -= reserved_segments(sbi); + + avail_user_block_count = sbi->user_block_count - + sbi->current_reserved_blocks - + F2FS_OPTION(sbi).root_reserved_blocks; + + if (wanted_reserved_segments * sbi->blocks_per_seg > + avail_user_block_count) { + f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u", + wanted_reserved_segments, + avail_user_block_count >> sbi->log_blocks_per_seg); + return -ENOSPC; + } + + SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments; + + f2fs_info(sbi, "IO align feature needs additional reserved segment: %u", + wanted_reserved_segments); + + return 0; +} + static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi) { if (!F2FS_OPTION(sbi).unusable_cap_perc) @@ -3540,6 +3581,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->seq_file_ra_mul = MIN_RA_MUL; sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; + spin_lock_init(&sbi->gc_urgent_high_lock); sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; @@ -4179,6 +4221,10 @@ try_onemore: goto free_nm; } + err = adjust_reserved_segment(sbi); + if (err) + goto free_nm; + /* For write statistics */ sbi->sectors_written_start = f2fs_get_sectors_written(sbi); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 8408f77764e8..df406c16b2eb 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -118,6 +118,15 @@ static ssize_t sb_status_show(struct f2fs_attr *a, return sprintf(buf, "%lx\n", sbi->s_flag); } +static ssize_t pending_discard_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + if (!SM_I(sbi)->dcc_info) + return -EINVAL; + return sprintf(buf, "%llu\n", (unsigned long long)atomic_read( + &SM_I(sbi)->dcc_info->discard_cmd_cnt)); +} + static ssize_t features_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -414,7 +423,9 @@ out: if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - - F2FS_OPTION(sbi).root_reserved_blocks)) { + F2FS_OPTION(sbi).root_reserved_blocks - + sbi->blocks_per_seg * + SM_I(sbi)->additional_reserved_segments)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } @@ -477,6 +488,15 @@ out: return count; } + if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) { + spin_lock(&sbi->gc_urgent_high_lock); + sbi->gc_urgent_high_limited = t != 0; + sbi->gc_urgent_high_remaining = t; + spin_unlock(&sbi->gc_urgent_high_lock); + + return count; + } + #ifdef CONFIG_F2FS_IOSTAT if (!strcmp(a->attr.name, "iostat_enable")) { sbi->iostat_enable = !!t; @@ -732,6 +752,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); +F2FS_RW_ATTR(F2FS_SBI, 
f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); @@ -743,6 +764,7 @@ F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); +F2FS_GENERAL_RO_ATTR(pending_discard); #ifdef CONFIG_F2FS_STAT_FS F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count); F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count); @@ -811,6 +833,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(main_blkaddr), ATTR_LIST(max_small_discards), ATTR_LIST(discard_granularity), + ATTR_LIST(pending_discard), ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), @@ -843,6 +866,7 @@ static struct attribute *f2fs_attrs[] = { #endif ATTR_LIST(data_io_flag), ATTR_LIST(node_io_flag), + ATTR_LIST(gc_urgent_high_remaining), ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index e348f33bcb2b..8e5cd9c916ff 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -226,15 +226,18 @@ static inline const struct xattr_handler *f2fs_xattr_handler(int index) } static struct f2fs_xattr_entry *__find_xattr(void *base_addr, - void *last_base_addr, int index, - size_t len, const char *name) + void *last_base_addr, void **last_addr, + int index, size_t len, const char *name) { struct f2fs_xattr_entry *entry; list_for_each_xattr(entry, base_addr) { if ((void *)(entry) + sizeof(__u32) > last_base_addr || - (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) + (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) { + if (last_addr) + *last_addr = entry; return NULL; + } if (entry->e_name_index != index) continue; @@ -254,19 +257,9 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, unsigned int inline_size = inline_xattr_size(inode); void *max_addr = base_addr + inline_size; - list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > max_addr || - (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { - *last_addr = entry; - return NULL; - } - if (entry->e_name_index != index) - continue; - if (entry->e_name_len != len) - continue; - if (!memcmp(entry->e_name, name, len)) - break; - } + entry = __find_xattr(base_addr, max_addr, last_addr, index, len, name); + if (!entry) + return NULL; /* inline xattr header or entry across max inline xattr size */ if (IS_XATTR_LAST_ENTRY(entry) && @@ -368,7 +361,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, else cur_addr = txattr_addr; - *xe = __find_xattr(cur_addr, last_txattr_addr, index, len, name); + *xe = __find_xattr(cur_addr, last_txattr_addr, NULL, index, len, name); if (!*xe) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", inode->i_ino); @@ -659,7 +652,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, last_base_addr = (void *)base_addr + XATTR_SIZE(inode); /* find entry with wanted name. 
*/ - here = __find_xattr(base_addr, last_base_addr, index, len, name); + here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name); if (!here) { f2fs_err(F2FS_I_SB(inode), "inode (%lu) has corrupted xattr", inode->i_ino); @@ -684,8 +677,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, } last = here; - while (!IS_XATTR_LAST_ENTRY(last)) + while (!IS_XATTR_LAST_ENTRY(last)) { + if ((void *)(last) + sizeof(__u32) > last_base_addr || + (void *)XATTR_NEXT_ENTRY(last) > last_base_addr) { + f2fs_err(F2FS_I_SB(inode), "inode (%lu) has invalid last xattr entry, entry_size: %zu", + inode->i_ino, ENTRY_SIZE(last)); + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + error = -EFSCORRUPTED; + goto exit; + } last = XATTR_NEXT_ENTRY(last); + } newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size); diff --git a/fs/fat/file.c b/fs/fat/file.c index 13855ba49cd9..a5a309fcc7fa 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -175,9 +175,10 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) static int fat_file_release(struct inode *inode, struct file *filp) { if ((filp->f_mode & FMODE_WRITE) && - MSDOS_SB(inode->i_sb)->options.flush) { + MSDOS_SB(inode->i_sb)->options.flush) { fat_flush_inodes(inode->i_sb, inode, NULL); - congestion_wait(BLK_RW_ASYNC, HZ/10); + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(HZ/10); } return 0; } diff --git a/fs/fs_context.c b/fs/fs_context.c index b7e43a780a62..24ce12f0db32 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -548,7 +548,7 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) param->key); } - if (len > PAGE_SIZE - 2 - size) + if (size + len + 2 > PAGE_SIZE) return invalf(fc, "VFS: Legacy: Cumulative options too large"); if (strchr(param->key, ',') || (param->type == fs_value_is_string && diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 74e627109b79..9d737904d07c 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -891,7 +891,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) return 0; out_vqs: - vdev->config->reset(vdev); + virtio_reset_device(vdev); virtio_fs_cleanup_vqs(vdev, fs); kfree(fs->vqs); @@ -923,7 +923,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) list_del_init(&fs->list); virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); - vdev->config->reset(vdev); + virtio_reset_device(vdev); virtio_fs_cleanup_vqs(vdev, fs); vdev->priv = NULL; diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 456e87aec7fd..68b4240c6191 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -260,8 +260,10 @@ struct hfsplus_cat_folder { __be32 access_date; __be32 backup_date; struct hfsplus_perm permissions; - struct DInfo user_info; - struct DXInfo finder_info; + struct_group_attr(info, __packed, + struct DInfo user_info; + struct DXInfo finder_info; + ); __be32 text_encoding; __be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. 
*/ } __packed; @@ -294,8 +296,10 @@ struct hfsplus_cat_file { __be32 access_date; __be32 backup_date; struct hfsplus_perm permissions; - struct FInfo user_info; - struct FXInfo finder_info; + struct_group_attr(info, __packed, + struct FInfo user_info; + struct FXInfo finder_info; + ); __be32 text_encoding; u32 reserved2; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index e2855ceefd39..49891b12c415 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -296,7 +296,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, sizeof(hfsplus_cat_entry)); if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) { if (size == folder_finderinfo_len) { - memcpy(&entry.folder.user_info, value, + memcpy(&entry.folder.info, value, folder_finderinfo_len); hfs_bnode_write(cat_fd.bnode, &entry, cat_fd.entryoffset, @@ -309,7 +309,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, } } else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) { if (size == file_finderinfo_len) { - memcpy(&entry.file.user_info, value, + memcpy(&entry.file.info, value, file_finderinfo_len); hfs_bnode_write(cat_fd.bnode, &entry, cat_fd.entryoffset, diff --git a/fs/io-wq.c b/fs/io-wq.c index a7763127f884..bb7f161bb19c 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -48,7 +48,8 @@ struct io_worker { struct io_wqe *wqe; struct io_wq_work *cur_work; - spinlock_t lock; + struct io_wq_work *next_work; + raw_spinlock_t lock; struct completion ref_done; @@ -405,8 +406,7 @@ static void io_wqe_dec_running(struct io_worker *worker) * Worker will start processing some work. Move it to the busy list, if * it's currently on the freelist */ -static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, - struct io_wq_work *work) +static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) __must_hold(wqe->lock) { if (worker->flags & IO_WORKER_F_FREE) { @@ -529,9 +529,10 @@ static void io_assign_current_work(struct io_worker *worker, cond_resched(); } - spin_lock(&worker->lock); + raw_spin_lock(&worker->lock); worker->cur_work = work; - spin_unlock(&worker->lock); + worker->next_work = NULL; + raw_spin_unlock(&worker->lock); } static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); @@ -546,7 +547,7 @@ static void io_worker_handle_work(struct io_worker *worker) do { struct io_wq_work *work; -get_next: + /* * If we got some work, mark us as busy. If we didn't, but * the list isn't empty, it means we stalled on hashed work. @@ -555,9 +556,20 @@ get_next: * clear the stalled flag. */ work = io_get_next_work(acct, worker); - if (work) - __io_worker_busy(wqe, worker, work); - + if (work) { + __io_worker_busy(wqe, worker); + + /* + * Make sure cancelation can find this, even before + * it becomes the active work. That avoids a window + * where the work has been removed from our general + * work list, but isn't yet discoverable as the + * current work item for this worker. 
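The hfsplus hunks above wrap user_info and finder_info in struct_group_attr(info, __packed, ...) so the xattr code shown next can memcpy() the Finder info as one object; grouping members this way is the kernel's idiom for letting a single memcpy() span adjacent fields without tripping FORTIFY_SOURCE bounds checking on the first member. A self-contained illustration of the struct_group() idea, using a hypothetical struct rather than the hfsplus layout:

#include <linux/stddef.h>       /* struct_group(), struct_group_attr() */
#include <linux/string.h>
#include <linux/types.h>

/* Group two members so one memcpy() may legally span both; the compiler
 * sees a single named object of the combined size. */
struct record {
        __be32 id;
        struct_group(payload,
                u8 kind;
                u8 data[7];
        );
};

static void set_payload(struct record *r, const void *src)
{
        memcpy(&r->payload, src, sizeof(r->payload));   /* kind + data */
}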
+ */ + raw_spin_lock(&worker->lock); + worker->next_work = work; + raw_spin_unlock(&worker->lock); + } raw_spin_unlock(&wqe->lock); if (!work) break; @@ -594,11 +606,6 @@ get_next: spin_unlock_irq(&wq->hash->wait.lock); if (wq_has_sleeper(&wq->hash->wait)) wake_up(&wq->hash->wait); - raw_spin_lock(&wqe->lock); - /* skip unnecessary unlock-lock wqe->lock */ - if (!work) - goto get_next; - raw_spin_unlock(&wqe->lock); } } while (work); @@ -815,7 +822,7 @@ fail: refcount_set(&worker->ref, 1); worker->wqe = wqe; - spin_lock_init(&worker->lock); + raw_spin_lock_init(&worker->lock); init_completion(&worker->ref_done); if (index == IO_WQ_ACCT_BOUND) @@ -973,6 +980,19 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); } +static bool __io_wq_worker_cancel(struct io_worker *worker, + struct io_cb_cancel_data *match, + struct io_wq_work *work) +{ + if (work && match->fn(work, match->data)) { + work->flags |= IO_WQ_WORK_CANCEL; + set_notify_signal(worker->task); + return true; + } + + return false; +} + static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_cb_cancel_data *match = data; @@ -981,13 +1001,11 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) * Hold the lock to avoid ->cur_work going out of scope, caller * may dereference the passed in work. */ - spin_lock(&worker->lock); - if (worker->cur_work && - match->fn(worker->cur_work, match->data)) { - set_notify_signal(worker->task); + raw_spin_lock(&worker->lock); + if (__io_wq_worker_cancel(worker, match, worker->cur_work) || + __io_wq_worker_cancel(worker, match, worker->next_work)) match->nr_running++; - } - spin_unlock(&worker->lock); + raw_spin_unlock(&worker->lock); return match->nr_running && !match->cancel_all; } @@ -1039,17 +1057,16 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, { int i; retry: - raw_spin_lock(&wqe->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { struct io_wqe_acct *acct = io_get_acct(wqe, i == 0); if (io_acct_cancel_pending_work(wqe, acct, match)) { + raw_spin_lock(&wqe->lock); if (match->cancel_all) goto retry; - return; + break; } } - raw_spin_unlock(&wqe->lock); } static void io_wqe_cancel_running_work(struct io_wqe *wqe, @@ -1074,25 +1091,27 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, * First check pending list, if we're lucky we can just remove it * from there. CANCEL_OK means that the work is returned as-new, * no completion will be posted for it. - */ - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wqe_cancel_pending_work(wqe, &match); - if (match.nr_pending && !match.cancel_all) - return IO_WQ_CANCEL_OK; - } - - /* - * Now check if a free (going busy) or busy worker has the work + * + * Then check if a free (going busy) or busy worker has the work * currently running. If we find it there, we'll return CANCEL_RUNNING * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. + * + * Do both of these while holding the wqe->lock, to ensure that + * we'll find a work item regardless of state. 
*/ for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; + raw_spin_lock(&wqe->lock); + io_wqe_cancel_pending_work(wqe, &match); + if (match.nr_pending && !match.cancel_all) { + raw_spin_unlock(&wqe->lock); + return IO_WQ_CANCEL_OK; + } + io_wqe_cancel_running_work(wqe, &match); + raw_spin_unlock(&wqe->lock); if (match.nr_running && !match.cancel_all) return IO_WQ_CANCEL_RUNNING; } @@ -1263,7 +1282,9 @@ static void io_wq_destroy(struct io_wq *wq) .fn = io_wq_work_match_all, .cancel_all = true, }; + raw_spin_lock(&wqe->lock); io_wqe_cancel_pending_work(wqe, &match); + raw_spin_unlock(&wqe->lock); free_cpumask_var(wqe->cpu_mask); kfree(wqe); } diff --git a/fs/io_uring.c b/fs/io_uring.c index de9c9de90655..e54c4127422e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1192,12 +1192,6 @@ static inline bool req_ref_put_and_test(struct io_kiocb *req) return atomic_dec_and_test(&req->refs); } -static inline void req_ref_put(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - WARN_ON_ONCE(req_ref_put_and_test(req)); -} - static inline void req_ref_get(struct io_kiocb *req) { WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); @@ -5468,12 +5462,14 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, static inline void io_poll_remove_entry(struct io_poll_iocb *poll) { - struct wait_queue_head *head = poll->head; + struct wait_queue_head *head = smp_load_acquire(&poll->head); - spin_lock_irq(&head->lock); - list_del_init(&poll->wait.entry); - poll->head = NULL; - spin_unlock_irq(&head->lock); + if (head) { + spin_lock_irq(&head->lock); + list_del_init(&poll->wait.entry); + poll->head = NULL; + spin_unlock_irq(&head->lock); + } } static void io_poll_remove_entries(struct io_kiocb *req) @@ -5481,10 +5477,26 @@ static void io_poll_remove_entries(struct io_kiocb *req) struct io_poll_iocb *poll = io_poll_get_single(req); struct io_poll_iocb *poll_double = io_poll_get_double(req); - if (poll->head) - io_poll_remove_entry(poll); - if (poll_double && poll_double->head) + /* + * While we hold the waitqueue lock and the waitqueue is nonempty, + * wake_up_pollfree() will wait for us. However, taking the waitqueue + * lock in the first place can race with the waitqueue being freed. + * + * We solve this as eventpoll does: by taking advantage of the fact that + * all users of wake_up_pollfree() will RCU-delay the actual free. If + * we enter rcu_read_lock() and see that the pointer to the queue is + * non-NULL, we can then lock it without the memory being freed out from + * under us. + * + * Keep holding rcu_read_lock() as long as we hold the queue lock, in + * case the caller deletes the entry from the queue, leaving it empty. + * In that case, only RCU prevents the queue memory from being freed. + */ + rcu_read_lock(); + io_poll_remove_entry(poll); + if (poll_double) io_poll_remove_entry(poll_double); + rcu_read_unlock(); } /* @@ -5624,6 +5636,30 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, wait); __poll_t mask = key_to_poll(key); + if (unlikely(mask & POLLFREE)) { + io_poll_mark_cancelled(req); + /* we have to kick tw in case it's not already */ + io_poll_execute(req, 0); + + /* + * If the waitqueue is being freed early but someone already + * holds ownership over it, we have to tear down the request as + * best we can. That means immediately removing the request from + * its waitqueue and preventing all further accesses to the + * waitqueue via the request.
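The io_poll_remove_entries() comment above is one half of a handshake: the POLLFREE branch in io_poll_wake() (continued in the next hunk) detaches the entry and then publishes the teardown with smp_store_release(&poll->head, NULL), while the removal side re-reads the head with smp_load_acquire() under rcu_read_lock(), so it either locks a still-live waitqueue or observes NULL. A condensed sketch of the removal side, using hypothetical types rather than the io_uring ones:

#include <linux/rcupdate.h>
#include <linux/wait.h>

struct poll_entry {
        struct wait_queue_head *head;   /* NULLed via smp_store_release() */
        struct wait_queue_entry wait;
};

/* Detach from a waitqueue whose memory is only RCU-delayed on free. */
static void poll_entry_detach(struct poll_entry *pe)
{
        struct wait_queue_head *head;

        rcu_read_lock();
        head = smp_load_acquire(&pe->head);     /* pairs with POLLFREE side */
        if (head) {
                spin_lock_irq(&head->lock);
                list_del_init(&pe->wait.entry);
                pe->head = NULL;
                spin_unlock_irq(&head->lock);
        }
        rcu_read_unlock();
}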
+ */ + list_del_init(&poll->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock. + */ + smp_store_release(&poll->head, NULL); + return 1; + } + /* for instances that support it check for an event match first */ if (mask && !(mask & poll->events)) return 0; @@ -6350,16 +6386,21 @@ static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - if (ret != -ENOENT) - return ret; + /* + * Fall-through even for -EALREADY, as we may have poll armed + * that need unarming. + */ + if (!ret) + return 0; spin_lock(&ctx->completion_lock); + ret = io_poll_cancel(ctx, sqe_addr, false); + if (ret != -ENOENT) + goto out; + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); spin_unlock_irq(&ctx->timeout_lock); - if (ret != -ENOENT) - goto out; - ret = io_poll_cancel(ctx, sqe_addr, false); out: spin_unlock(&ctx->completion_lock); return ret; diff --git a/fs/ksmbd/asn1.c b/fs/ksmbd/asn1.c index b014f4638610..c03eba090368 100644 --- a/fs/ksmbd/asn1.c +++ b/fs/ksmbd/asn1.c @@ -21,101 +21,11 @@ #include "ksmbd_spnego_negtokeninit.asn1.h" #include "ksmbd_spnego_negtokentarg.asn1.h" -#define SPNEGO_OID_LEN 7 #define NTLMSSP_OID_LEN 10 -#define KRB5_OID_LEN 7 -#define KRB5U2U_OID_LEN 8 -#define MSKRB5_OID_LEN 7 -static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 }; -static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 }; -static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 }; -static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 }; -static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 }; static char NTLMSSP_OID_STR[NTLMSSP_OID_LEN] = { 0x2b, 0x06, 0x01, 0x04, 0x01, 0x82, 0x37, 0x02, 0x02, 0x0a }; -static bool -asn1_subid_decode(const unsigned char **begin, const unsigned char *end, - unsigned long *subid) -{ - const unsigned char *ptr = *begin; - unsigned char ch; - - *subid = 0; - - do { - if (ptr >= end) - return false; - - ch = *ptr++; - *subid <<= 7; - *subid |= ch & 0x7F; - } while ((ch & 0x80) == 0x80); - - *begin = ptr; - return true; -} - -static bool asn1_oid_decode(const unsigned char *value, size_t vlen, - unsigned long **oid, size_t *oidlen) -{ - const unsigned char *iptr = value, *end = value + vlen; - unsigned long *optr; - unsigned long subid; - - vlen += 1; - if (vlen < 2 || vlen > UINT_MAX / sizeof(unsigned long)) - goto fail_nullify; - - *oid = kmalloc(vlen * sizeof(unsigned long), GFP_KERNEL); - if (!*oid) - return false; - - optr = *oid; - - if (!asn1_subid_decode(&iptr, end, &subid)) - goto fail; - - if (subid < 40) { - optr[0] = 0; - optr[1] = subid; - } else if (subid < 80) { - optr[0] = 1; - optr[1] = subid - 40; - } else { - optr[0] = 2; - optr[1] = subid - 80; - } - - *oidlen = 2; - optr += 2; - - while (iptr < end) { - if (++(*oidlen) > vlen) - goto fail; - - if (!asn1_subid_decode(&iptr, end, optr++)) - goto fail; - } - return true; - -fail: - kfree(*oid); -fail_nullify: - *oid = NULL; - return false; -} - -static bool oid_eq(unsigned long *oid1, unsigned int oid1len, - unsigned long *oid2, unsigned int oid2len) -{ - if (oid1len != oid2len) - return false; - - return memcmp(oid1, oid2, oid1len) == 0; -} - int ksmbd_decode_negTokenInit(unsigned char *security_blob, int length, 
struct ksmbd_conn *conn) @@ -252,26 +162,18 @@ int build_spnego_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, int ksmbd_gssapi_this_mech(void *context, size_t hdrlen, unsigned char tag, const void *value, size_t vlen) { - unsigned long *oid; - size_t oidlen; - int err = 0; - - if (!asn1_oid_decode(value, vlen, &oid, &oidlen)) { - err = -EBADMSG; - goto out; - } + enum OID oid; - if (!oid_eq(oid, oidlen, SPNEGO_OID, SPNEGO_OID_LEN)) - err = -EBADMSG; - kfree(oid); -out: - if (err) { + oid = look_up_OID(value, vlen); + if (oid != OID_spnego) { char buf[50]; sprint_oid(value, vlen, buf, sizeof(buf)); ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); + return -EBADMSG; } - return err; + + return 0; } int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen, @@ -279,37 +181,31 @@ int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen, size_t vlen) { struct ksmbd_conn *conn = context; - unsigned long *oid; - size_t oidlen; + enum OID oid; int mech_type; - char buf[50]; - if (!asn1_oid_decode(value, vlen, &oid, &oidlen)) - goto fail; - - if (oid_eq(oid, oidlen, NTLMSSP_OID, NTLMSSP_OID_LEN)) + oid = look_up_OID(value, vlen); + if (oid == OID_ntlmssp) { mech_type = KSMBD_AUTH_NTLMSSP; - else if (oid_eq(oid, oidlen, MSKRB5_OID, MSKRB5_OID_LEN)) + } else if (oid == OID_mskrb5) { mech_type = KSMBD_AUTH_MSKRB5; - else if (oid_eq(oid, oidlen, KRB5_OID, KRB5_OID_LEN)) + } else if (oid == OID_krb5) { mech_type = KSMBD_AUTH_KRB5; - else if (oid_eq(oid, oidlen, KRB5U2U_OID, KRB5U2U_OID_LEN)) + } else if (oid == OID_krb5u2u) { mech_type = KSMBD_AUTH_KRB5U2U; - else - goto fail; + } else { + char buf[50]; + + sprint_oid(value, vlen, buf, sizeof(buf)); + ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); + return -EBADMSG; + } conn->auth_mechs |= mech_type; if (conn->preferred_auth_mech == 0) conn->preferred_auth_mech = mech_type; - kfree(oid); return 0; - -fail: - kfree(oid); - sprint_oid(value, vlen, buf, sizeof(buf)); - ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); - return -EBADMSG; } int ksmbd_neg_token_init_mech_token(void *context, size_t hdrlen, diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index 3503b1c48cb4..dc3d061edda9 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -215,7 +215,7 @@ out: * Return: 0 on success, error number on error */ int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, - int blen, char *domain_name) + int blen, char *domain_name, char *cryptkey) { char ntlmv2_hash[CIFS_ENCPWD_SIZE]; char ntlmv2_rsp[CIFS_HMAC_MD5_HASH_SIZE]; @@ -256,7 +256,7 @@ int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, goto out; } - memcpy(construct, sess->ntlmssp.cryptkey, CIFS_CRYPTO_KEY_SIZE); + memcpy(construct, cryptkey, CIFS_CRYPTO_KEY_SIZE); memcpy(construct + CIFS_CRYPTO_KEY_SIZE, &ntlmv2->blob_signature, blen); rc = crypto_shash_update(CRYPTO_HMACMD5(ctx), construct, len); @@ -295,7 +295,8 @@ out: * Return: 0 on success, error number on error */ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, - int blob_len, struct ksmbd_session *sess) + int blob_len, struct ksmbd_conn *conn, + struct ksmbd_session *sess) { char *domain_name; unsigned int nt_off, dn_off; @@ -324,7 +325,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, /* TODO : use domain name that imported from configuration file */ domain_name = smb_strndup_from_utf16((const char *)authblob + dn_off, - dn_len, true, sess->conn->local_nls); + dn_len, true, conn->local_nls); if (IS_ERR(domain_name)) return 
PTR_ERR(domain_name); @@ -333,7 +334,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, domain_name); ret = ksmbd_auth_ntlmv2(sess, (struct ntlmv2_resp *)((char *)authblob + nt_off), nt_len - CIFS_ENCPWD_SIZE, - domain_name); + domain_name, conn->ntlmssp.cryptkey); kfree(domain_name); return ret; } @@ -347,7 +348,7 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, * */ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, - int blob_len, struct ksmbd_session *sess) + int blob_len, struct ksmbd_conn *conn) { if (blob_len < sizeof(struct negotiate_message)) { ksmbd_debug(AUTH, "negotiate blob len %d too small\n", @@ -361,7 +362,7 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, return -EINVAL; } - sess->ntlmssp.client_flags = le32_to_cpu(negblob->NegotiateFlags); + conn->ntlmssp.client_flags = le32_to_cpu(negblob->NegotiateFlags); return 0; } @@ -375,14 +376,14 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, */ unsigned int ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, - struct ksmbd_session *sess) + struct ksmbd_conn *conn) { struct target_info *tinfo; wchar_t *name; __u8 *target_name; unsigned int flags, blob_off, blob_len, type, target_info_len = 0; int len, uni_len, conv_len; - int cflags = sess->ntlmssp.client_flags; + int cflags = conn->ntlmssp.client_flags; memcpy(chgblob->Signature, NTLMSSP_SIGNATURE, 8); chgblob->MessageType = NtLmChallenge; @@ -403,7 +404,7 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, if (cflags & NTLMSSP_REQUEST_TARGET) flags |= NTLMSSP_REQUEST_TARGET; - if (sess->conn->use_spnego && + if (conn->use_spnego && (cflags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) flags |= NTLMSSP_NEGOTIATE_EXTENDED_SEC; @@ -414,7 +415,7 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, return -ENOMEM; conv_len = smb_strtoUTF16((__le16 *)name, ksmbd_netbios_name(), len, - sess->conn->local_nls); + conn->local_nls); if (conv_len < 0 || conv_len > len) { kfree(name); return -EINVAL; @@ -430,8 +431,8 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, chgblob->TargetName.BufferOffset = cpu_to_le32(blob_off); /* Initialize random conn challenge */ - get_random_bytes(sess->ntlmssp.cryptkey, sizeof(__u64)); - memcpy(chgblob->Challenge, sess->ntlmssp.cryptkey, + get_random_bytes(conn->ntlmssp.cryptkey, sizeof(__u64)); + memcpy(chgblob->Challenge, conn->ntlmssp.cryptkey, CIFS_CRYPTO_KEY_SIZE); /* Add Target Information to security buffer */ diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h index 9c2d4badd05d..95629651cf26 100644 --- a/fs/ksmbd/auth.h +++ b/fs/ksmbd/auth.h @@ -38,16 +38,16 @@ struct kvec; int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, unsigned int nvec, int enc); void ksmbd_copy_gss_neg_header(void *buf); -int ksmbd_auth_ntlm(struct ksmbd_session *sess, char *pw_buf); int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, - int blen, char *domain_name); + int blen, char *domain_name, char *cryptkey); int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, - int blob_len, struct ksmbd_session *sess); + int blob_len, struct ksmbd_conn *conn, + struct ksmbd_session *sess); int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, - int blob_len, struct ksmbd_session *sess); + int blob_len, struct ksmbd_conn *conn); unsigned int ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, - struct 
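A note on the ksmbd_netlink.h hunks that follow: each IPC message gains a fixed block of reserved fields. These structs form the netlink ABI between the kernel and userspace (ksmbd.mountd), so reserving space now keeps sizeof() and existing field offsets stable when later versions add members. The pattern, illustrated with a hypothetical message type rather than one of the ksmbd structs:

#include <linux/types.h>

/* Version 1: claim spare room up front. */
struct example_ipc_msg {
        __u32 handle;
        __u32 flags;
        __u32 reserved[16];             /* all zeroes today */
};

/* Later version: a new field consumes reserved space, so sizeof() and
 * the offsets of existing fields are unchanged and an older peer still
 * parses the message correctly. */
struct example_ipc_msg_v2 {
        __u32 handle;
        __u32 flags;
        __u32 timeout_ms;               /* was reserved[0] */
        __u32 reserved[15];
};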
ksmbd_session *sess); + struct ksmbd_conn *conn); int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, int in_len, char *out_blob, int *out_len); int ksmbd_sign_smb2_pdu(struct ksmbd_conn *conn, char *key, struct kvec *iov, diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index 83a94d0bb480..208d2cff7bd3 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -62,6 +62,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); conn->total_credits = 1; + conn->outstanding_credits = 1; init_waitqueue_head(&conn->req_running_q); INIT_LIST_HEAD(&conn->conns_list); @@ -386,17 +387,24 @@ out: static void stop_sessions(void) { struct ksmbd_conn *conn; + struct ksmbd_transport *t; again: read_lock(&conn_list_lock); list_for_each_entry(conn, &conn_list, conns_list) { struct task_struct *task; - task = conn->transport->handler; + t = conn->transport; + task = t->handler; if (task) ksmbd_debug(CONN, "Stop session handler %s/%d\n", task->comm, task_pid_nr(task)); conn->status = KSMBD_SESS_EXITING; + if (t->ops->shutdown) { + read_unlock(&conn_list_lock); + t->ops->shutdown(t); + read_lock(&conn_list_lock); + } } read_unlock(&conn_list_lock); diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h index e5403c587a58..7a59aacb5daa 100644 --- a/fs/ksmbd/connection.h +++ b/fs/ksmbd/connection.h @@ -61,8 +61,8 @@ struct ksmbd_conn { atomic_t req_running; /* References which are made for this Server object*/ atomic_t r_count; - unsigned short total_credits; - unsigned short max_credits; + unsigned int total_credits; + unsigned int outstanding_credits; spinlock_t credits_lock; wait_queue_head_t req_running_q; /* Lock to protect requests list*/ @@ -72,12 +72,7 @@ struct ksmbd_conn { int connection_type; struct ksmbd_stats stats; char ClientGUID[SMB2_CLIENT_GUID_SIZE]; - union { - /* pending trans request table */ - struct trans_state *recent_trans; - /* Used by ntlmssp */ - char *ntlmssp_cryptkey; - }; + struct ntlmssp_auth ntlmssp; spinlock_t llist_lock; struct list_head lock_list; @@ -122,6 +117,7 @@ struct ksmbd_conn_ops { struct ksmbd_transport_ops { int (*prepare)(struct ksmbd_transport *t); void (*disconnect)(struct ksmbd_transport *t); + void (*shutdown)(struct ksmbd_transport *t); int (*read)(struct ksmbd_transport *t, char *buf, unsigned int size); int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov, int size, bool need_invalidate_rkey, diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index c6718a05d347..71bfb7de4472 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -103,6 +103,8 @@ struct ksmbd_startup_request { * we set the SPARSE_FILES bit (0x40). */ __u32 sub_auth[3]; /* Subauth value for Security ID */ + __u32 smb2_max_credits; /* MAX credits */ + __u32 reserved[128]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; }; @@ -113,7 +115,7 @@ struct ksmbd_startup_request { * IPC request to shutdown ksmbd server. 
*/ struct ksmbd_shutdown_request { - __s32 reserved; + __s32 reserved[16]; }; /* @@ -122,6 +124,7 @@ struct ksmbd_shutdown_request { struct ksmbd_login_request { __u32 handle; __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -135,6 +138,7 @@ struct ksmbd_login_response { __u16 status; __u16 hash_sz; /* hash size */ __s8 hash[KSMBD_REQ_MAX_HASH_SZ]; /* password hash */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -143,6 +147,7 @@ struct ksmbd_login_response { struct ksmbd_share_config_request { __u32 handle; __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME]; /* share name */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -157,6 +162,7 @@ struct ksmbd_share_config_response { __u16 force_directory_mode; __u16 force_uid; __u16 force_gid; + __u32 reserved[128]; /* Reserved room */ __u32 veto_list_sz; __s8 ____payload[]; }; @@ -187,6 +193,7 @@ struct ksmbd_tree_connect_request { __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; __s8 share[KSMBD_REQ_MAX_SHARE_NAME]; __s8 peer_addr[64]; + __u32 reserved[16]; /* Reserved room */ }; /* @@ -196,6 +203,7 @@ struct ksmbd_tree_connect_response { __u32 handle; __u16 status; __u16 connection_flags; + __u32 reserved[16]; /* Reserved room */ }; /* @@ -204,6 +212,7 @@ struct ksmbd_tree_connect_response { struct ksmbd_tree_disconnect_request { __u64 session_id; /* session id */ __u64 connect_id; /* tree connection id */ + __u32 reserved[16]; /* Reserved room */ }; /* @@ -212,6 +221,7 @@ struct ksmbd_tree_disconnect_request { struct ksmbd_logout_request { __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ __u32 account_flags; + __u32 reserved[16]; /* Reserved room */ }; /* diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/ksmbd/mgmt/user_config.c index 1019d3677d55..279d00feff21 100644 --- a/fs/ksmbd/mgmt/user_config.c +++ b/fs/ksmbd/mgmt/user_config.c @@ -67,3 +67,13 @@ int ksmbd_anonymous_user(struct ksmbd_user *user) return 1; return 0; } + +bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2) +{ + if (strcmp(u1->name, u2->name)) + return false; + if (memcmp(u1->passkey, u2->passkey, u1->passkey_sz)) + return false; + + return true; +} diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/ksmbd/mgmt/user_config.h index aff80b029579..6a44109617f1 100644 --- a/fs/ksmbd/mgmt/user_config.h +++ b/fs/ksmbd/mgmt/user_config.h @@ -64,4 +64,5 @@ struct ksmbd_user *ksmbd_login_user(const char *account); struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp); void ksmbd_free_user(struct ksmbd_user *user); int ksmbd_anonymous_user(struct ksmbd_user *user); +bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2); #endif /* __USER_CONFIG_MANAGEMENT_H__ */ diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h index 82289c3cbd2b..e241f16a3851 100644 --- a/fs/ksmbd/mgmt/user_session.h +++ b/fs/ksmbd/mgmt/user_session.h @@ -45,7 +45,6 @@ struct ksmbd_session { int state; __u8 *Preauth_HashValue; - struct ntlmssp_auth ntlmssp; char sess_key[CIFS_KEY_SIZE]; struct hlist_node hlist; diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 50d0b1022289..4a9460153b59 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -289,7 +289,7 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, unsigned int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len; unsigned short credit_charge = le16_to_cpu(hdr->CreditCharge); void *__hdr = hdr; - int ret; + int ret = 0; switch (hdr->Command) { case SMB2_QUERY_INFO: @@ 
-326,21 +326,27 @@ static int smb2_validate_credit_charge(struct ksmbd_conn *conn, ksmbd_debug(SMB, "Insufficient credit charge, given: %d, needed: %d\n", credit_charge, calc_credit_num); return 1; - } else if (credit_charge > conn->max_credits) { + } else if (credit_charge > conn->vals->max_credits) { ksmbd_debug(SMB, "Too large credit charge: %d\n", credit_charge); return 1; } spin_lock(&conn->credits_lock); - if (credit_charge <= conn->total_credits) { - conn->total_credits -= credit_charge; - ret = 0; - } else { + if (credit_charge > conn->total_credits) { ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n", credit_charge, conn->total_credits); ret = 1; } + + if ((u64)conn->outstanding_credits + credit_charge > conn->vals->max_credits) { + ksmbd_debug(SMB, "Limits exceeding the maximum allowable outstanding requests, given : %u, pending : %u\n", + credit_charge, conn->outstanding_credits); + ret = 1; + } else + conn->outstanding_credits += credit_charge; + spin_unlock(&conn->credits_lock); + return ret; } diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index 02a44d28bdaf..ab23da2120b9 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -19,6 +19,7 @@ static struct smb_version_values smb21_server_values = { .max_read_size = SMB21_DEFAULT_IOSIZE, .max_write_size = SMB21_DEFAULT_IOSIZE, .max_trans_size = SMB21_DEFAULT_IOSIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -44,6 +45,7 @@ static struct smb_version_values smb30_server_values = { .max_read_size = SMB3_DEFAULT_IOSIZE, .max_write_size = SMB3_DEFAULT_IOSIZE, .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -70,6 +72,7 @@ static struct smb_version_values smb302_server_values = { .max_read_size = SMB3_DEFAULT_IOSIZE, .max_write_size = SMB3_DEFAULT_IOSIZE, .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -96,6 +99,7 @@ static struct smb_version_values smb311_server_values = { .max_read_size = SMB3_DEFAULT_IOSIZE, .max_write_size = SMB3_DEFAULT_IOSIZE, .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .max_credits = SMB2_MAX_CREDITS, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, .shared_lock_type = SMB2_LOCKFLAG_SHARED, @@ -197,7 +201,6 @@ void init_smb2_1_server(struct ksmbd_conn *conn) conn->ops = &smb2_0_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_HMAC_SHA256_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -215,7 +218,6 @@ void init_smb3_0_server(struct ksmbd_conn *conn) conn->ops = &smb3_0_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -240,7 +242,6 @@ void init_smb3_02_server(struct ksmbd_conn *conn) conn->ops = &smb3_0_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ 
-265,7 +266,6 @@ int init_smb3_11_server(struct ksmbd_conn *conn) conn->ops = &smb3_11_server_ops; conn->cmds = smb2_0_server_cmds; conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); - conn->max_credits = SMB2_MAX_CREDITS; conn->signing_algorithm = SIGNING_ALG_AES_CMAC_LE; if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) @@ -304,3 +304,11 @@ void init_smb2_max_trans_size(unsigned int sz) smb302_server_values.max_trans_size = sz; smb311_server_values.max_trans_size = sz; } + +void init_smb2_max_credits(unsigned int sz) +{ + smb21_server_values.max_credits = sz; + smb30_server_values.max_credits = sz; + smb302_server_values.max_credits = sz; + smb311_server_values.max_credits = sz; +} diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index b8b3a4c28b74..1866c81c5c99 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -299,16 +299,15 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work); struct smb2_hdr *hdr = ksmbd_resp_buf_next(work); struct ksmbd_conn *conn = work->conn; - unsigned short credits_requested; + unsigned short credits_requested, aux_max; unsigned short credit_charge, credits_granted = 0; - unsigned short aux_max, aux_credits; if (work->send_no_response) return 0; hdr->CreditCharge = req_hdr->CreditCharge; - if (conn->total_credits > conn->max_credits) { + if (conn->total_credits > conn->vals->max_credits) { hdr->CreditRequest = 0; pr_err("Total credits overflow: %d\n", conn->total_credits); return -EINVAL; @@ -316,6 +315,14 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) credit_charge = max_t(unsigned short, le16_to_cpu(req_hdr->CreditCharge), 1); + if (credit_charge > conn->total_credits) { + ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n", + credit_charge, conn->total_credits); + return -EINVAL; + } + + conn->total_credits -= credit_charge; + conn->outstanding_credits -= credit_charge; credits_requested = max_t(unsigned short, le16_to_cpu(req_hdr->CreditRequest), 1); @@ -325,16 +332,14 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) * TODO: Need to adjust CreditRequest value according to * current cpu load */ - aux_credits = credits_requested - 1; if (hdr->Command == SMB2_NEGOTIATE) - aux_max = 0; + aux_max = 1; else - aux_max = conn->max_credits - credit_charge; - aux_credits = min_t(unsigned short, aux_credits, aux_max); - credits_granted = credit_charge + aux_credits; + aux_max = conn->vals->max_credits - credit_charge; + credits_granted = min_t(unsigned short, credits_requested, aux_max); - if (conn->max_credits - conn->total_credits < credits_granted) - credits_granted = conn->max_credits - + if (conn->vals->max_credits - conn->total_credits < credits_granted) + credits_granted = conn->vals->max_credits - conn->total_credits; conn->total_credits += credits_granted; @@ -610,16 +615,14 @@ static void destroy_previous_session(struct ksmbd_user *user, u64 id) /** * smb2_get_name() - get filename string from the on-wire smb format - * @share: ksmbd_share_config pointer * @src: source buffer * @maxlen: maxlen of source string - * @nls_table: nls_table pointer + * @local_nls: nls_table pointer * * Return: matching converted filename on success, otherwise error ptr */ static char * -smb2_get_name(struct ksmbd_share_config *share, const char *src, - const int maxlen, struct nls_table *local_nls) +smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls) { char *name; @@ -1303,7 +1306,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, int sz, rc;
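The smb2_set_rsp_credits() rewrite above does two jobs: it consumes the request's CreditCharge from both total_credits and the new outstanding_credits counter up front, and it clamps the grant against the per-dialect conn->vals->max_credits rather than a per-connection copy. The clamp reduces to the arithmetic below; this is a restatement for clarity, not the kernel code, and the inputs are assumed pre-clamped to at least 1 as in the caller.

#include <linux/minmax.h>

/* Restatement of the grant computation above; assumes charge and
 * total_credits have already been validated against max_credits. */
static unsigned short grant_credits(unsigned int total_credits,
                                    unsigned int max_credits,
                                    unsigned short requested,
                                    unsigned short charge,
                                    bool is_negotiate)
{
        /* NEGOTIATE gets exactly one credit; otherwise the grant may use
         * whatever headroom the charge left below the dialect maximum. */
        unsigned short aux_max = is_negotiate ? 1 : max_credits - charge;
        unsigned short granted = min_t(unsigned short, requested, aux_max);

        /* Never let the connection exceed the dialect-wide ceiling. */
        if (max_credits - total_credits < granted)
                granted = max_credits - total_credits;
        return granted;
}

For example, with max_credits = 8192 and total_credits = 130, a request for 256 credits at charge 1 grants 256, while an SMB2_NEGOTIATE request is always granted exactly one credit.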
ksmbd_debug(SMB, "negotiate phase\n"); - rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->sess); + rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->conn); if (rc) return rc; @@ -1313,7 +1316,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, memset(chgblob, 0, sizeof(struct challenge_message)); if (!work->conn->use_spnego) { - sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess); + sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->conn); if (sz < 0) return -ENOMEM; @@ -1329,7 +1332,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, return -ENOMEM; chgblob = (struct challenge_message *)neg_blob; - sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess); + sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->conn); if (sz < 0) { rc = -ENOMEM; goto out; @@ -1450,60 +1453,62 @@ static int ntlm_authenticate(struct ksmbd_work *work) ksmbd_free_user(user); return 0; } - ksmbd_free_user(sess->user); - } - sess->user = user; - if (user_guest(sess->user)) { - if (conn->sign) { - ksmbd_debug(SMB, "Guest login not allowed when signing enabled\n"); + if (!ksmbd_compare_user(sess->user, user)) { + ksmbd_free_user(user); return -EPERM; } + ksmbd_free_user(user); + } else { + sess->user = user; + } + if (user_guest(sess->user)) { rsp->SessionFlags = SMB2_SESSION_FLAG_IS_GUEST_LE; } else { struct authenticate_message *authblob; authblob = user_authblob(conn, req); sz = le16_to_cpu(req->SecurityBufferLength); - rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, sess); + rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, conn, sess); if (rc) { set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD); ksmbd_debug(SMB, "authentication failed\n"); return -EPERM; } + } - /* - * If session state is SMB2_SESSION_VALID, We can assume - * that it is reauthentication. And the user/password - * has been verified, so return it here. - */ - if (sess->state == SMB2_SESSION_VALID) { - if (conn->binding) - goto binding_session; - return 0; - } + /* + * If session state is SMB2_SESSION_VALID, We can assume + * that it is reauthentication. And the user/password + * has been verified, so return it here. 
+ */ + if (sess->state == SMB2_SESSION_VALID) { + if (conn->binding) + goto binding_session; + return 0; + } - if ((conn->sign || server_conf.enforced_signing) || - (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) - sess->sign = true; + if ((rsp->SessionFlags != SMB2_SESSION_FLAG_IS_GUEST_LE && + (conn->sign || server_conf.enforced_signing)) || + (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) + sess->sign = true; - if (smb3_encryption_negotiated(conn) && - !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { - rc = conn->ops->generate_encryptionkey(sess); - if (rc) { - ksmbd_debug(SMB, - "SMB3 encryption key generation failed\n"); - return -EINVAL; - } - sess->enc = true; - rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; - /* - * signing is disabled if encryption is enabled - * on this session - */ - sess->sign = false; + if (smb3_encryption_negotiated(conn) && + !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { + rc = conn->ops->generate_encryptionkey(sess); + if (rc) { + ksmbd_debug(SMB, + "SMB3 encryption key generation failed\n"); + return -EINVAL; } + sess->enc = true; + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; + /* + * signing is disabled if encryption is enabled + * on this session + */ + sess->sign = false; } binding_session: @@ -2057,9 +2062,6 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_debug(SMB, "request\n"); - /* Got a valid session, set connection state */ - WARN_ON(sess->conn != conn); - /* setting CifsExiting here may race with start_tcp_sess */ ksmbd_conn_set_need_reconnect(work); ksmbd_close_session_fds(work); @@ -2530,8 +2532,7 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } - name = smb2_get_name(share, - req->Buffer, + name = smb2_get_name(req->Buffer, le16_to_cpu(req->NameLength), work->conn->local_nls); if (IS_ERR(name)) { @@ -3392,7 +3393,6 @@ static int dentry_name(struct ksmbd_dir_info *d_info, int info_level) * @conn: connection instance * @info_level: smb information level * @d_info: structure included variables for query dir - * @user_ns: user namespace * @ksmbd_kstat: ksmbd wrapper of dirent stat information * * if directory has many entries, find first can't read it fully.
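Context for the credit checks in the two hunks above: smb2_validate_credit_charge() enforces the [MS-SMB2] 3.1.5.2 rule that a request must carry one credit per 64 KiB (or fraction thereof) of the larger of its send payload and its expected response payload. A minimal sketch of that calculation follows; the helper name is hypothetical and not part of the patch:

/* One credit covers up to 64 KiB of payload; every request costs at least one. */
static unsigned int smb2_credit_charge_for(unsigned long long send_payload,
					   unsigned long long expected_response_payload)
{
	unsigned long long max_payload = send_payload > expected_response_payload ?
					 send_payload : expected_response_payload;

	if (max_payload == 0)
		return 1;
	return (unsigned int)((max_payload - 1) / 65536 + 1);
}

The server then rejects requests whose stated CreditCharge is smaller than this value, larger than the credits currently granted, or large enough to push outstanding_credits past vals->max_credits.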
@@ -4018,6 +4018,7 @@ err_out2: * buffer_check_err() - helper function to check buffer errors * @reqOutputBufferLength: max buffer length expected in command response * @rsp: query info response buffer contains output buffer length + * @rsp_org: base response buffer pointer in case of chained response * @infoclass_size: query info class response buffer size * * Return: 0 on success, otherwise error @@ -5398,8 +5399,7 @@ static int smb2_rename(struct ksmbd_work *work, goto out; } - new_name = smb2_get_name(share, - file_info->FileName, + new_name = smb2_get_name(file_info->FileName, le32_to_cpu(file_info->FileNameLength), local_nls); if (IS_ERR(new_name)) { @@ -5510,8 +5510,7 @@ static int smb2_create_link(struct ksmbd_work *work, if (!pathname) return -ENOMEM; - link_name = smb2_get_name(share, - file_info->FileName, + link_name = smb2_get_name(file_info->FileName, le32_to_cpu(file_info->FileNameLength), local_nls); if (IS_ERR(link_name) || S_ISDIR(file_inode(filp)->i_mode)) { @@ -5849,7 +5848,7 @@ static int set_file_mode_info(struct ksmbd_file *fp, * smb2_set_info_file() - handler for smb2 set info command * @work: smb work containing set info command buffer * @fp: ksmbd_file pointer - * @info_class: smb2 set info class + * @req: request buffer pointer * @share: ksmbd_share_config pointer * * Return: 0 on success, otherwise error @@ -6121,25 +6120,33 @@ out: return err; } -static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, - struct smb2_read_req *req, void *data_buf, - size_t length) +static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work, + struct smb2_buffer_desc_v1 *desc, + __le32 Channel, + __le16 ChannelInfoOffset, + __le16 ChannelInfoLength) { - struct smb2_buffer_desc_v1 *desc = - (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; - int err; - if (work->conn->dialect == SMB30_PROT_ID && - req->Channel != SMB2_CHANNEL_RDMA_V1) + Channel != SMB2_CHANNEL_RDMA_V1) return -EINVAL; - if (req->ReadChannelInfoOffset == 0 || - le16_to_cpu(req->ReadChannelInfoLength) < sizeof(*desc)) + if (ChannelInfoOffset == 0 || + le16_to_cpu(ChannelInfoLength) < sizeof(*desc)) return -EINVAL; work->need_invalidate_rkey = - (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); + (Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); work->remote_key = le32_to_cpu(desc->token); + return 0; +} + +static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, + struct smb2_read_req *req, void *data_buf, + size_t length) +{ + struct smb2_buffer_desc_v1 *desc = + (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; + int err; err = ksmbd_conn_rdma_write(work->conn, data_buf, length, le32_to_cpu(desc->token), @@ -6162,7 +6169,7 @@ int smb2_read(struct ksmbd_work *work) struct ksmbd_conn *conn = work->conn; struct smb2_read_req *req; struct smb2_read_rsp *rsp; - struct ksmbd_file *fp; + struct ksmbd_file *fp = NULL; loff_t offset; size_t length, mincount; ssize_t nbytes = 0, remain_bytes = 0; @@ -6176,6 +6183,18 @@ int smb2_read(struct ksmbd_work *work) return smb2_read_pipe(work); } + if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || + req->Channel == SMB2_CHANNEL_RDMA_V1) { + err = smb2_set_remote_key_for_rdma(work, + (struct smb2_buffer_desc_v1 *) + &req->Buffer[0], + req->Channel, + req->ReadChannelInfoOffset, + req->ReadChannelInfoLength); + if (err) + goto out; + } + fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId), le64_to_cpu(req->PersistentFileId)); if (!fp) { @@ -6361,21 +6380,6 @@ static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, desc = (struct smb2_buffer_desc_v1 
*)&req->Buffer[0]; - if (work->conn->dialect == SMB30_PROT_ID && - req->Channel != SMB2_CHANNEL_RDMA_V1) - return -EINVAL; - - if (req->Length != 0 || req->DataOffset != 0) - return -EINVAL; - - if (req->WriteChannelInfoOffset == 0 || - le16_to_cpu(req->WriteChannelInfoLength) < sizeof(*desc)) - return -EINVAL; - - work->need_invalidate_rkey = - (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); - work->remote_key = le32_to_cpu(desc->token); - data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); if (!data_buf) return -ENOMEM; @@ -6422,6 +6426,20 @@ int smb2_write(struct ksmbd_work *work) return smb2_write_pipe(work); } + if (req->Channel == SMB2_CHANNEL_RDMA_V1 || + req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) { + if (req->Length != 0 || req->DataOffset != 0) + return -EINVAL; + err = smb2_set_remote_key_for_rdma(work, + (struct smb2_buffer_desc_v1 *) + &req->Buffer[0], + req->Channel, + req->WriteChannelInfoOffset, + req->WriteChannelInfoLength); + if (err) + goto out; + } + if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { ksmbd_debug(SMB, "User does not have write permission\n"); err = -EACCES; @@ -7243,15 +7261,10 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, struct sockaddr_storage_rsp *sockaddr_storage; unsigned int flags; unsigned long long speed; - struct sockaddr_in6 *csin6 = (struct sockaddr_in6 *)&conn->peer_addr; rtnl_lock(); for_each_netdev(&init_net, netdev) { - if (out_buf_len < - nbytes + sizeof(struct network_interface_info_ioctl_rsp)) { - rtnl_unlock(); - return -ENOSPC; - } + bool ipv4_set = false; if (netdev->type == ARPHRD_LOOPBACK) continue; @@ -7259,12 +7272,20 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, flags = dev_get_flags(netdev); if (!(flags & IFF_RUNNING)) continue; +ipv6_retry: + if (out_buf_len < + nbytes + sizeof(struct network_interface_info_ioctl_rsp)) { + rtnl_unlock(); + return -ENOSPC; + } nii_rsp = (struct network_interface_info_ioctl_rsp *) &rsp->Buffer[nbytes]; nii_rsp->IfIndex = cpu_to_le32(netdev->ifindex); nii_rsp->Capability = 0; + if (netdev->real_num_tx_queues > 1) + nii_rsp->Capability |= cpu_to_le32(RSS_CAPABLE); if (ksmbd_rdma_capable_netdev(netdev)) nii_rsp->Capability |= cpu_to_le32(RDMA_CAPABLE); @@ -7289,8 +7310,7 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, nii_rsp->SockAddr_Storage; memset(sockaddr_storage, 0, 128); - if (conn->peer_addr.ss_family == PF_INET || - ipv6_addr_v4mapped(&csin6->sin6_addr)) { + if (!ipv4_set) { struct in_device *idev; sockaddr_storage->Family = cpu_to_le16(INTERNETWORK); @@ -7301,6 +7321,9 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, continue; sockaddr_storage->addr4.IPv4address = idev_ipv4_address(idev); + nbytes += sizeof(struct network_interface_info_ioctl_rsp); + ipv4_set = true; + goto ipv6_retry; } else { struct inet6_dev *idev6; struct inet6_ifaddr *ifa; @@ -7322,9 +7345,8 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, break; } sockaddr_storage->addr6.ScopeId = 0; + nbytes += sizeof(struct network_interface_info_ioctl_rsp); } - - nbytes += sizeof(struct network_interface_info_ioctl_rsp); } rtnl_unlock(); diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index 4a3e4339d4c4..725b800c29c8 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -980,6 +980,7 @@ int init_smb3_11_server(struct ksmbd_conn *conn); void init_smb2_max_read_size(unsigned int sz); void init_smb2_max_write_size(unsigned int sz); void init_smb2_max_trans_size(unsigned int sz); +void 
init_smb2_max_credits(unsigned int sz); bool is_smb2_neg_cmd(struct ksmbd_work *work); bool is_smb2_rsp(struct ksmbd_work *work); diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 50590842b651..e1369b4345a9 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -365,6 +365,7 @@ struct smb_version_values { __u32 max_read_size; __u32 max_write_size; __u32 max_trans_size; + __u32 max_credits; __u32 large_lock_type; __u32 exclusive_lock_type; __u32 shared_lock_type; diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 1acf1892a466..3ad6881e0f7e 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -301,6 +301,8 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) init_smb2_max_write_size(req->smb2_max_write); if (req->smb2_max_trans) init_smb2_max_trans_size(req->smb2_max_trans); + if (req->smb2_max_credits) + init_smb2_max_credits(req->smb2_max_credits); ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 7e57cbb0bb35..3c1ec1ac0b27 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -34,7 +34,8 @@ #include "smbstatus.h" #include "transport_rdma.h" -#define SMB_DIRECT_PORT 5445 +#define SMB_DIRECT_PORT_IWARP 5445 +#define SMB_DIRECT_PORT_INFINIBAND 445 #define SMB_DIRECT_VERSION_LE cpu_to_le16(0x0100) @@ -60,6 +61,10 @@ * as defined in [MS-SMBD] 3.1.1.1 * Those may change after a SMB_DIRECT negotiation */ + +/* Use port 445 as the SMB Direct port by default */ +static int smb_direct_port = SMB_DIRECT_PORT_INFINIBAND; + /* The local peer's maximum number of credits to grant to the peer */ static int smb_direct_receive_credit_max = 255; @@ -75,10 +80,18 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024; /* The maximum single-message size which can be received */ static int smb_direct_max_receive_size = 8192; -static int smb_direct_max_read_write_size = 1024 * 1024; +static int smb_direct_max_read_write_size = 1048512; static int smb_direct_max_outstanding_rw_ops = 8; +static LIST_HEAD(smb_direct_device_list); +static DEFINE_RWLOCK(smb_direct_device_lock); + +struct smb_direct_device { + struct ib_device *ib_dev; + struct list_head list; +}; + static struct smb_direct_listener { struct rdma_cm_id *cm_id; } smb_direct_listener; @@ -415,6 +428,7 @@ static void free_transport(struct smb_direct_transport *t) if (t->qp) { ib_drain_qp(t->qp); + ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs); ib_destroy_qp(t->qp); } @@ -555,6 +569,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) } t->negotiation_requested = true; t->full_packet_received = true; + enqueue_reassembly(t, recvmsg, 0); wake_up_interruptible(&t->wait_status); break; case SMB_DIRECT_MSG_DATA_TRANSFER: { @@ -1438,6 +1453,15 @@ static void smb_direct_disconnect(struct ksmbd_transport *t) free_transport(st); } +static void smb_direct_shutdown(struct ksmbd_transport *t) +{ + struct smb_direct_transport *st = smb_trans_direct_transfort(t); + + ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", st->cm_id); + + smb_direct_disconnect_rdma_work(&st->disconnect_work); +} + static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { @@ -1581,19 +1605,13 @@ static int smb_direct_accept_client(struct smb_direct_transport *t) pr_err("error at rdma_accept: %d\n", ret); return ret; } - - wait_event_interruptible(t->wait_status, - t->status != SMB_DIRECT_CS_NEW); - if (t->status !=
SMB_DIRECT_CS_CONNECTED) - return -ENOTCONN; return 0; } -static int smb_direct_negotiate(struct smb_direct_transport *t) +static int smb_direct_prepare_negotiation(struct smb_direct_transport *t) { int ret; struct smb_direct_recvmsg *recvmsg; - struct smb_direct_negotiate_req *req; recvmsg = get_free_recvmsg(t); if (!recvmsg) @@ -1603,44 +1621,20 @@ static int smb_direct_negotiate(struct smb_direct_transport *t) ret = smb_direct_post_recv(t, recvmsg); if (ret) { pr_err("Can't post recv: %d\n", ret); - goto out; + goto out_err; } t->negotiation_requested = false; ret = smb_direct_accept_client(t); if (ret) { pr_err("Can't accept client\n"); - goto out; + goto out_err; } smb_direct_post_recv_credits(&t->post_recv_credits_work.work); - - ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); - ret = wait_event_interruptible_timeout(t->wait_status, - t->negotiation_requested || - t->status == SMB_DIRECT_CS_DISCONNECTED, - SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); - if (ret <= 0 || t->status == SMB_DIRECT_CS_DISCONNECTED) { - ret = ret < 0 ? ret : -ETIMEDOUT; - goto out; - } - - ret = smb_direct_check_recvmsg(recvmsg); - if (ret == -ECONNABORTED) - goto out; - - req = (struct smb_direct_negotiate_req *)recvmsg->packet; - t->max_recv_size = min_t(int, t->max_recv_size, - le32_to_cpu(req->preferred_send_size)); - t->max_send_size = min_t(int, t->max_send_size, - le32_to_cpu(req->max_receive_size)); - t->max_fragmented_send_size = - le32_to_cpu(req->max_fragmented_size); - - ret = smb_direct_send_negotiate_response(t, ret); -out: - if (recvmsg) - put_recvmsg(t, recvmsg); + return 0; +out_err: + put_recvmsg(t, recvmsg); return ret; } @@ -1724,7 +1718,9 @@ static int smb_direct_init_params(struct smb_direct_transport *t, cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; cap->max_inline_data = 0; - cap->max_rdma_ctxs = 0; + cap->max_rdma_ctxs = + rdma_rw_mr_factor(device, t->cm_id->port_num, max_pages) * + smb_direct_max_outstanding_rw_ops; return 0; } @@ -1806,6 +1802,7 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, { int ret; struct ib_qp_init_attr qp_attr; + int pages_per_rw; t->pd = ib_alloc_pd(t->cm_id->device, 0); if (IS_ERR(t->pd)) { @@ -1853,6 +1850,23 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t, t->qp = t->cm_id->qp; t->cm_id->event_handler = smb_direct_cm_handler; + pages_per_rw = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; + if (pages_per_rw > t->cm_id->device->attrs.max_sgl_rd) { + int pages_per_mr, mr_count; + + pages_per_mr = min_t(int, pages_per_rw, + t->cm_id->device->attrs.max_fast_reg_page_list_len); + mr_count = DIV_ROUND_UP(pages_per_rw, pages_per_mr) * + atomic_read(&t->rw_avail_ops); + ret = ib_mr_pool_init(t->qp, &t->qp->rdma_mrs, mr_count, + IB_MR_TYPE_MEM_REG, pages_per_mr, 0); + if (ret) { + pr_err("failed to init mr pool count %d pages %d\n", + mr_count, pages_per_mr); + goto err; + } + } + return 0; err: if (t->qp) { @@ -1877,6 +1891,49 @@ err: static int smb_direct_prepare(struct ksmbd_transport *t) { struct smb_direct_transport *st = smb_trans_direct_transfort(t); + struct smb_direct_recvmsg *recvmsg; + struct smb_direct_negotiate_req *req; + int ret; + + ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); + ret = wait_event_interruptible_timeout(st->wait_status, + st->negotiation_requested || + st->status == SMB_DIRECT_CS_DISCONNECTED, + SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); + if (ret <= 0 || st->status == SMB_DIRECT_CS_DISCONNECTED) + return ret < 0 ? 
ret : -ETIMEDOUT; + + recvmsg = get_first_reassembly(st); + if (!recvmsg) + return -ECONNABORTED; + + ret = smb_direct_check_recvmsg(recvmsg); + if (ret == -ECONNABORTED) + goto out; + + req = (struct smb_direct_negotiate_req *)recvmsg->packet; + st->max_recv_size = min_t(int, st->max_recv_size, + le32_to_cpu(req->preferred_send_size)); + st->max_send_size = min_t(int, st->max_send_size, + le32_to_cpu(req->max_receive_size)); + st->max_fragmented_send_size = + le32_to_cpu(req->max_fragmented_size); + st->max_fragmented_recv_size = + (st->recv_credit_max * st->max_recv_size) / 2; + + ret = smb_direct_send_negotiate_response(st, ret); +out: + spin_lock_irq(&st->reassembly_queue_lock); + st->reassembly_queue_length--; + list_del(&recvmsg->list); + spin_unlock_irq(&st->reassembly_queue_lock); + put_recvmsg(st, recvmsg); + + return ret; +} + +static int smb_direct_connect(struct smb_direct_transport *st) +{ int ret; struct ib_qp_cap qp_cap; @@ -1898,13 +1955,11 @@ static int smb_direct_prepare(struct ksmbd_transport *t) return ret; } - ret = smb_direct_negotiate(st); + ret = smb_direct_prepare_negotiation(st); if (ret) { pr_err("Can't negotiate: %d\n", ret); return ret; } - - st->status = SMB_DIRECT_CS_CONNECTED; return 0; } @@ -1920,6 +1975,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) { struct smb_direct_transport *t; + int ret; if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { ksmbd_debug(RDMA, @@ -1932,18 +1988,23 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) if (!t) return -ENOMEM; + ret = smb_direct_connect(t); + if (ret) + goto out_err; + KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, KSMBD_TRANS(t)->conn, "ksmbd:r%u", - SMB_DIRECT_PORT); + smb_direct_port); if (IS_ERR(KSMBD_TRANS(t)->handler)) { - int ret = PTR_ERR(KSMBD_TRANS(t)->handler); - + ret = PTR_ERR(KSMBD_TRANS(t)->handler); pr_err("Can't start thread\n"); - free_transport(t); - return ret; + goto out_err; } return 0; +out_err: + free_transport(t); + return ret; } static int smb_direct_listen_handler(struct rdma_cm_id *cm_id, @@ -2007,12 +2068,65 @@ err: return ret; } +static int smb_direct_ib_client_add(struct ib_device *ib_dev) +{ + struct smb_direct_device *smb_dev; + + /* Use port 5445 if the device type is iWARP (no IB) */ + if (ib_dev->node_type != RDMA_NODE_IB_CA) + smb_direct_port = SMB_DIRECT_PORT_IWARP; + + if (!ib_dev->ops.get_netdev || + !rdma_frwr_is_supported(&ib_dev->attrs)) + return 0; + + smb_dev = kzalloc(sizeof(*smb_dev), GFP_KERNEL); + if (!smb_dev) + return -ENOMEM; + smb_dev->ib_dev = ib_dev; + + write_lock(&smb_direct_device_lock); + list_add(&smb_dev->list, &smb_direct_device_list); + write_unlock(&smb_direct_device_lock); + + ksmbd_debug(RDMA, "ib device added: name %s\n", ib_dev->name); + return 0; +} + +static void smb_direct_ib_client_remove(struct ib_device *ib_dev, + void *client_data) +{ + struct smb_direct_device *smb_dev, *tmp; + + write_lock(&smb_direct_device_lock); + list_for_each_entry_safe(smb_dev, tmp, &smb_direct_device_list, list) { + if (smb_dev->ib_dev == ib_dev) { + list_del(&smb_dev->list); + kfree(smb_dev); + break; + } + } + write_unlock(&smb_direct_device_lock); +} + +static struct ib_client smb_direct_ib_client = { + .name = "ksmbd_smb_direct_ib", + .add = smb_direct_ib_client_add, + .remove = smb_direct_ib_client_remove, +}; + int ksmbd_rdma_init(void) { int ret; smb_direct_listener.cm_id = NULL; + ret =
ib_register_client(&smb_direct_ib_client); + if (ret) { + pr_err("failed to ib_register_client\n"); + return ret; + } + /* When a client is running out of send credits, the credits are * granted by the server's sending a packet using this queue. * This avoids the situation that a client cannot send packets @@ -2023,7 +2137,7 @@ int ksmbd_rdma_init(void) if (!smb_direct_wq) return -ENOMEM; - ret = smb_direct_listen(SMB_DIRECT_PORT); + ret = smb_direct_listen(smb_direct_port); if (ret) { destroy_workqueue(smb_direct_wq); smb_direct_wq = NULL; @@ -2036,36 +2150,67 @@ return 0; } -int ksmbd_rdma_destroy(void) +void ksmbd_rdma_destroy(void) { - if (smb_direct_listener.cm_id) - rdma_destroy_id(smb_direct_listener.cm_id); + if (!smb_direct_listener.cm_id) + return; + + ib_unregister_client(&smb_direct_ib_client); + rdma_destroy_id(smb_direct_listener.cm_id); + smb_direct_listener.cm_id = NULL; if (smb_direct_wq) { destroy_workqueue(smb_direct_wq); smb_direct_wq = NULL; } - return 0; } bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { - struct ib_device *ibdev; + struct smb_direct_device *smb_dev; + int i; bool rdma_capable = false; - ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); - if (ibdev) { - if (rdma_frwr_is_supported(&ibdev->attrs)) - rdma_capable = true; - ib_device_put(ibdev); + read_lock(&smb_direct_device_lock); + list_for_each_entry(smb_dev, &smb_direct_device_list, list) { + for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) { + struct net_device *ndev; + + ndev = smb_dev->ib_dev->ops.get_netdev(smb_dev->ib_dev, + i + 1); + if (!ndev) + continue; + + if (ndev == netdev) { + dev_put(ndev); + rdma_capable = true; + goto out; + } + dev_put(ndev); + } + } +out: + read_unlock(&smb_direct_device_lock); + + if (rdma_capable == false) { + struct ib_device *ibdev; + + ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); + if (ibdev) { + if (rdma_frwr_is_supported(&ibdev->attrs)) + rdma_capable = true; + ib_device_put(ibdev); + } } + return rdma_capable; } static struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { .prepare = smb_direct_prepare, .disconnect = smb_direct_disconnect, + .shutdown = smb_direct_shutdown, .writev = smb_direct_writev, .read = smb_direct_read, .rdma_read = smb_direct_rdma_read, diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h index 0fa8adc0776f..5567d93a6f96 100644 --- a/fs/ksmbd/transport_rdma.h +++ b/fs/ksmbd/transport_rdma.h @@ -7,8 +7,6 @@ #ifndef __KSMBD_TRANSPORT_RDMA_H__ #define __KSMBD_TRANSPORT_RDMA_H__ -#define SMB_DIRECT_PORT 5445 - /* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */ struct smb_direct_negotiate_req { __le16 min_version; @@ -52,7 +50,7 @@ struct smb_direct_data_transfer { #ifdef CONFIG_SMB_SERVER_SMBDIRECT int ksmbd_rdma_init(void); -int ksmbd_rdma_destroy(void); +void ksmbd_rdma_destroy(void); bool ksmbd_rdma_capable_netdev(struct net_device *netdev); #else static inline int ksmbd_rdma_init(void) { return 0; } diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index c14320e03b69..82a1429bbe12 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -404,7 +404,7 @@ static int create_socket(struct interface *iface) &ksmbd_socket); if (ret) { pr_err("Can't create socket for ipv4: %d\n", ret); - goto out_error; + goto out_clear; } sin.sin_family = PF_INET; @@ -462,6 +462,7 @@ static int create_socket(struct interface *iface) out_error: tcp_destroy_socket(ksmbd_socket); +out_clear: iface->ksmbd_socket = NULL; return ret;
} diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h index 448576fbe4b7..36239ce31afd 100644 --- a/fs/ksmbd/vfs_cache.h +++ b/fs/ksmbd/vfs_cache.h @@ -96,16 +96,6 @@ struct ksmbd_file { int durable_timeout; - /* for SMB1 */ - int pid; - - /* conflict lock fail count for SMB1 */ - unsigned int cflock_cnt; - /* last lock failure start offset for SMB1 */ - unsigned long long llock_fstart; - - int dirent_offset; - /* if ls is happening on directory, below is valid */ struct ksmbd_readdir_data readdir_data; int dot_dotdot[2]; diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index bc3e2cd4117f..063dd16d75b5 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -195,12 +195,12 @@ void nilfs_page_bug(struct page *page) */ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) { - struct buffer_head *dbh, *dbufs, *sbh, *sbufs; + struct buffer_head *dbh, *dbufs, *sbh; unsigned long mask = NILFS_BUFFER_INHERENT_BITS; BUG_ON(PageWriteback(dst)); - sbh = sbufs = page_buffers(src); + sbh = page_buffers(src); if (!page_has_buffers(dst)) create_empty_buffers(dst, sbh->b_size, 0); diff --git a/fs/proc/array.c b/fs/proc/array.c index 43a7abde9e42..fd8b0c12b2cb 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -92,6 +92,7 @@ #include <linux/string_helpers.h> #include <linux/user_namespace.h> #include <linux/fs_struct.h> +#include <linux/kthread.h> #include <asm/processor.h> #include "internal.h" @@ -102,6 +103,8 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) if (p->flags & PF_WQ_WORKER) wq_worker_comm(tcomm, sizeof(tcomm), p); + else if (p->flags & PF_KTHREAD) + get_kthread_comm(tcomm, sizeof(tcomm), p); else __get_task_comm(tcomm, sizeof(tcomm), p); diff --git a/fs/proc/base.c b/fs/proc/base.c index 13eda8de2998..d654ce7150fd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -670,10 +670,10 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, /************************************************************************/ /* permission checks */ -static int proc_fd_access_allowed(struct inode *inode) +static bool proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; - int allowed = 0; + bool allowed = false; /* Allow access to a task's file descriptors if it is us or we * may use ptrace attach to the process and find out that * information.
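Context for the transport_tcp.c change above: the old code jumped to out_error when sock_create() itself failed, which then called tcp_destroy_socket() on a socket that was never created. The fix adds a second label so each failure path unwinds only what was actually set up. A minimal self-contained sketch of the pattern (all names are hypothetical, not from the patch):

#include <stdio.h>
#include <stdlib.h>

struct conn { int *sock; };

static void destroy_sock(int *s) { free(s); }

static int create_conn(struct conn *c, int fail_early, int fail_late)
{
	int *s = NULL;
	int ret = 0;

	if (fail_early) {
		ret = -1;
		goto out_clear;		/* nothing allocated yet: only reset state */
	}
	s = malloc(sizeof(*s));
	if (!s) {
		ret = -1;
		goto out_clear;
	}
	if (fail_late) {
		ret = -1;
		goto out_error;		/* socket exists: destroy it, then reset */
	}
	c->sock = s;
	return 0;

out_error:
	destroy_sock(s);
out_clear:
	c->sock = NULL;
	return ret;
}

int main(void)
{
	struct conn c;

	printf("early failure: %d\n", create_conn(&c, 1, 0));
	printf("late failure: %d\n", create_conn(&c, 0, 1));
	printf("success: %d\n", create_conn(&c, 0, 0));
	return 0;
}

Falling through from out_error into out_clear is the point of the idiom: cleanup labels are ordered from most-initialized to least-initialized state.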
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5d66faecd4ef..389e1e42e7d9 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -163,7 +163,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) else { pr_err("sysctl duplicate entry: "); sysctl_print_dir(head->parent); - pr_cont("/%s\n", entry->procname); + pr_cont("%s\n", entry->procname); return -EEXIST; } } @@ -1020,8 +1020,8 @@ failed: if (IS_ERR(subdir)) { pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); - pr_cont("/%*.*s %ld\n", - namelen, namelen, name, PTR_ERR(subdir)); + pr_cont("%*.*s %ld\n", namelen, namelen, name, + PTR_ERR(subdir)); } drop_sysctl_table(&dir->header); if (new) @@ -1053,7 +1053,6 @@ static int sysctl_follow_link(struct ctl_table_header **phead, struct ctl_dir *dir; int ret; - ret = 0; spin_lock(&sysctl_lock); root = (*pentry)->data; set = lookup_header_set(root); @@ -1626,7 +1625,7 @@ static void put_links(struct ctl_table_header *header) else { pr_err("sysctl link missing during unregister: "); sysctl_print_dir(parent); - pr_cont("/%s\n", name); + pr_cont("%s\n", name); } } } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 509f85148fee..702754dd1daf 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -65,8 +65,6 @@ static size_t vmcoredd_orig_sz; static DECLARE_RWSEM(vmcore_cb_rwsem); /* List of registered vmcore callbacks. */ static LIST_HEAD(vmcore_cb_list); -/* Whether we had a surprise unregistration of a callback. */ -static bool vmcore_cb_unstable; /* Whether the vmcore has been opened once. */ static bool vmcore_opened; @@ -94,10 +92,8 @@ void unregister_vmcore_cb(struct vmcore_cb *cb) * very unusual (e.g., forced driver removal), but we cannot stop * unregistering. */ - if (vmcore_opened) { + if (vmcore_opened) pr_warn_once("Unexpected vmcore callback unregistration\n"); - vmcore_cb_unstable = true; - } up_write(&vmcore_cb_rwsem); } EXPORT_SYMBOL_GPL(unregister_vmcore_cb); @@ -108,8 +104,6 @@ static bool pfn_is_ram(unsigned long pfn) bool ret = true; lockdep_assert_held_read(&vmcore_cb_rwsem); - if (unlikely(vmcore_cb_unstable)) - return false; list_for_each_entry(cb, &vmcore_cb_list, next) { if (unlikely(!cb->pfn_is_ram)) @@ -581,7 +575,7 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, * looping over all pages without a reason. */ down_read(&vmcore_cb_rwsem); - if (!list_empty(&vmcore_cb_list) || vmcore_cb_unstable) + if (!list_empty(&vmcore_cb_list)) ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); else ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index c43877c8a279..505533c43a92 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -93,21 +93,6 @@ struct getbmapx { #define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */ /* - * Structure for XFS_IOC_FSSETDM. - * For use by backup and restore programs to set the XFS on-disk inode - * fields di_dmevmask and di_dmstate. These must be set to exactly and - * only values previously obtained via xfs_bulkstat! (Specifically the - * struct xfs_bstat fields bs_dmevmask and bs_dmstate.) - */ -#ifndef HAVE_FSDMIDATA -struct fsdmidata { - __u32 fsd_dmevmask; /* corresponds to di_dmevmask */ - __u16 fsd_padding; - __u16 fsd_dmstate; /* corresponds to di_dmstate */ -}; -#endif - -/* * File segment locking set data type for 64 bit access. * Also used for all the RESV/FREE interfaces. 
*/ @@ -562,16 +547,10 @@ typedef struct xfs_fsop_handlereq { /* * Compound structures for passing args through Handle Request interfaces - * xfs_fssetdm_by_handle, xfs_attrlist_by_handle, xfs_attrmulti_by_handle - * - ioctls: XFS_IOC_FSSETDM_BY_HANDLE, XFS_IOC_ATTRLIST_BY_HANDLE, and - * XFS_IOC_ATTRMULTI_BY_HANDLE + * xfs_attrlist_by_handle, xfs_attrmulti_by_handle + * - ioctls: XFS_IOC_ATTRLIST_BY_HANDLE, and XFS_IOC_ATTRMULTI_BY_HANDLE */ -typedef struct xfs_fsop_setdm_handlereq { - struct xfs_fsop_handlereq hreq; /* handle information */ - struct fsdmidata __user *data; /* DMAPI data */ -} xfs_fsop_setdm_handlereq_t; - /* * Flags passed in xfs_attr_multiop.am_flags for the attr ioctl interface. * @@ -781,15 +760,15 @@ struct xfs_scrub_metadata { * For 'documentation' purposes more than anything else, * the "cmd #" field reflects the IRIX fcntl number. */ -#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) -#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +/* XFS_IOC_ALLOCSP ------- deprecated 10 */ +/* XFS_IOC_FREESP -------- deprecated 11 */ #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) #define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR #define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR -#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) -#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) +/* XFS_IOC_ALLOCSP64 ----- deprecated 36 */ +/* XFS_IOC_FREESP64 ------ deprecated 37 */ #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) -#define XFS_IOC_FSSETDM _IOW ('X', 39, struct fsdmidata) +/* XFS_IOC_FSSETDM ------- deprecated 39 */ #define XFS_IOC_RESVSP _IOW ('X', 40, struct xfs_flock64) #define XFS_IOC_UNRESVSP _IOW ('X', 41, struct xfs_flock64) #define XFS_IOC_RESVSP64 _IOW ('X', 42, struct xfs_flock64) @@ -831,7 +810,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */ #define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */ -#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) +/* XFS_IOC_FSSETDM_BY_HANDLE -- deprecated 121 */ #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) #define XFS_IOC_FSGEOMETRY_V4 _IOR ('X', 124, struct xfs_fsop_geom_v4) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 797ea0c8b14e..d4a387d3d0ce 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -771,8 +771,7 @@ int xfs_alloc_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len, - int alloc_type) + xfs_off_t len) { xfs_mount_t *mp = ip->i_mount; xfs_off_t count; @@ -865,8 +864,8 @@ xfs_alloc_file_space( goto error; error = xfs_bmapi_write(tp, ip, startoffset_fsb, - allocatesize_fsb, alloc_type, 0, imapp, - &nimaps); + allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp, + &nimaps); if (error) goto error; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 9f993168b55b..24b37d211f1d 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -54,7 +54,7 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, /* preallocation and hole punch interface */ int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len, int alloc_type); + xfs_off_t len); int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8d4c5ca261bd..22ad207bedf4
100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1051,8 +1051,7 @@ xfs_file_fallocate( } if (!xfs_is_always_cow_inode(ip)) { - error = xfs_alloc_file_space(ip, offset, len, - XFS_BMAPI_PREALLOC); + error = xfs_alloc_file_space(ip, offset, len); if (error) goto out_unlock; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 8ea47a9d5aad..03a6198c97f6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -627,87 +627,6 @@ xfs_attrmulti_by_handle( return error; } -int -xfs_ioc_space( - struct file *filp, - xfs_flock64_t *bf) -{ - struct inode *inode = file_inode(filp); - struct xfs_inode *ip = XFS_I(inode); - struct iattr iattr; - enum xfs_prealloc_flags flags = XFS_PREALLOC_CLEAR; - uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - int error; - - if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) - return -EPERM; - - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - if (xfs_is_always_cow_inode(ip)) - return -EOPNOTSUPP; - - if (filp->f_flags & O_DSYNC) - flags |= XFS_PREALLOC_SYNC; - if (filp->f_mode & FMODE_NOCMTIME) - flags |= XFS_PREALLOC_INVISIBLE; - - error = mnt_want_write_file(filp); - if (error) - return error; - - xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); - if (error) - goto out_unlock; - inode_dio_wait(inode); - - switch (bf->l_whence) { - case 0: /*SEEK_SET*/ - break; - case 1: /*SEEK_CUR*/ - bf->l_start += filp->f_pos; - break; - case 2: /*SEEK_END*/ - bf->l_start += XFS_ISIZE(ip); - break; - default: - error = -EINVAL; - goto out_unlock; - } - - if (bf->l_start < 0 || bf->l_start > inode->i_sb->s_maxbytes) { - error = -EINVAL; - goto out_unlock; - } - - if (bf->l_start > XFS_ISIZE(ip)) { - error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), - bf->l_start - XFS_ISIZE(ip), - XFS_BMAPI_PREALLOC); - if (error) - goto out_unlock; - } - - iattr.ia_valid = ATTR_SIZE; - iattr.ia_size = bf->l_start; - error = xfs_vn_setattr_size(file_mnt_user_ns(filp), file_dentry(filp), - &iattr); - if (error) - goto out_unlock; - - error = xfs_update_prealloc_flags(ip, flags); - -out_unlock: - xfs_iunlock(ip, iolock); - mnt_drop_write_file(filp); - return error; -} - /* Return 0 on success or positive error */ int xfs_fsbulkstat_one_fmt( @@ -1936,6 +1855,15 @@ xfs_fs_eofblocks_from_user( } /* + * These long-unused ioctls were removed from the official ioctl API in 5.17, + * but retain these definitions so that we can log warnings about them. + */ +#define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) +#define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) +#define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) +#define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) + +/* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. * So we don't "sign flip" like most other routines. 
This means @@ -1965,13 +1893,11 @@ xfs_file_ioctl( case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: { - xfs_flock64_t bf; - - if (copy_from_user(&bf, arg, sizeof(bf))) - return -EFAULT; - return xfs_ioc_space(filp, &bf); - } + case XFS_IOC_FREESP64: + xfs_warn_once(mp, + "%s should use fallocate; XFS_IOC_{ALLOC,FREE}SP ioctl unsupported", + current->comm); + return -ENOTTY; case XFS_IOC_DIOINFO: { struct xfs_buftarg *target = xfs_inode_buftarg(ip); struct dioattr da; diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 845d3bcab74b..d4abba2c13c1 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -10,12 +10,6 @@ struct xfs_bstat; struct xfs_ibulk; struct xfs_inogrp; - -extern int -xfs_ioc_space( - struct file *filp, - xfs_flock64_t *bf); - int xfs_ioc_swapext( xfs_swapext_t *sxp); diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 8783af203cfc..004ed2a251e8 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -28,22 +28,6 @@ #ifdef BROKEN_X86_ALIGNMENT STATIC int -xfs_compat_flock64_copyin( - xfs_flock64_t *bf, - compat_xfs_flock64_t __user *arg32) -{ - if (get_user(bf->l_type, &arg32->l_type) || - get_user(bf->l_whence, &arg32->l_whence) || - get_user(bf->l_start, &arg32->l_start) || - get_user(bf->l_len, &arg32->l_len) || - get_user(bf->l_sysid, &arg32->l_sysid) || - get_user(bf->l_pid, &arg32->l_pid) || - copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32))) - return -EFAULT; - return 0; -} - -STATIC int xfs_compat_ioc_fsgeometry_v1( struct xfs_mount *mp, compat_xfs_fsop_geom_v1_t __user *arg32) @@ -445,17 +429,6 @@ xfs_file_compat_ioctl( switch (cmd) { #if defined(BROKEN_X86_ALIGNMENT) - case XFS_IOC_ALLOCSP_32: - case XFS_IOC_FREESP_32: - case XFS_IOC_ALLOCSP64_32: - case XFS_IOC_FREESP64_32: { - struct xfs_flock64 bf; - - if (xfs_compat_flock64_copyin(&bf, arg)) - return -EFAULT; - cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - return xfs_ioc_space(filp, &bf); - } case XFS_IOC_FSGEOMETRY_V1_32: return xfs_compat_ioc_fsgeometry_v1(ip->i_mount, arg); case XFS_IOC_FSGROWFSDATA_32: { diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 9929482bf358..fc5a91f3a5e0 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -154,10 +154,6 @@ typedef struct compat_xfs_flock64 { __s32 l_pad[4]; /* reserve area */ } compat_xfs_flock64_t; -#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64) -#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64) -#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64) -#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64) #define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64) #define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) #define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
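The xfs_file_ioctl() change above points XFS_IOC_ALLOCSP/FREESP callers at fallocate(2). A small userspace sketch of the replacement (the file name is arbitrary): fallocate() with mode 0 allocates the range and extends i_size past EOF, which is roughly what XFS_IOC_ALLOCSP did.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("prealloc.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Preallocate 1 MiB starting at offset 0; with mode 0 the
	 * file size grows to cover the allocated range. */
	if (fallocate(fd, 0, 0, 1024 * 1024) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}

For FREESP-style truncation, ftruncate(2) remains the portable equivalent.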
