From 0a26f327e46c203229e72c823dfec71a2b405ec5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 5 Jan 2023 12:51:45 -0800 Subject: block: make BLK_DEF_MAX_SECTORS unsigned This is used as an unsigned value, so define it that way to avoid having to cast it. Suggested-by: Christoph Hellwig Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20230105205146.3610282-2-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 7d28e3aa406c..4c601ca9552a 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -2123,8 +2123,7 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_physical_block_size(nullb->q, dev->blocksize); if (!dev->max_sectors) dev->max_sectors = queue_max_hw_sectors(nullb->q); - dev->max_sectors = min_t(unsigned int, dev->max_sectors, - BLK_DEF_MAX_SECTORS); + dev->max_sectors = min(dev->max_sectors, BLK_DEF_MAX_SECTORS); blk_queue_max_hw_sectors(nullb->q, dev->max_sectors); if (dev->virt_boundary) -- cgit v1.2.3 From 887b98c74fdf9ab44e93ad9166977cbbb766d2c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?= Date: Fri, 13 Jan 2023 13:35:04 +0100 Subject: drbd: split off drbd_buildtag into separate file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be more similar to what we do in the out-of-tree module and ease the upstreaming process. Signed-off-by: Christoph Böhmwalder Reviewed-by: Joel Colledge Link: https://lore.kernel.org/r/20230113123506.144082-2-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/Makefile | 2 +- drivers/block/drbd/drbd_buildtag.c | 22 ++++++++++++++++++++++ drivers/block/drbd/drbd_main.c | 18 ------------------ 3 files changed, 23 insertions(+), 19 deletions(-) create mode 100644 drivers/block/drbd/drbd_buildtag.c (limited to 'drivers') diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile index c93e462130ff..67a8b352a1d5 100644 --- a/drivers/block/drbd/Makefile +++ b/drivers/block/drbd/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -drbd-y := drbd_bitmap.o drbd_proc.o +drbd-y := drbd_buildtag.o drbd_bitmap.o drbd_proc.o drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o drbd-y += drbd_main.o drbd_strings.o drbd_nl.o drbd-y += drbd_interval.o drbd_state.o diff --git a/drivers/block/drbd/drbd_buildtag.c b/drivers/block/drbd/drbd_buildtag.c new file mode 100644 index 000000000000..956a4d5c339b --- /dev/null +++ b/drivers/block/drbd/drbd_buildtag.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +const char *drbd_buildtag(void) +{ + /* DRBD built from external sources has here a reference to the + * git hash of the source code. + */ + + static char buildtag[38] = "\0uilt-in"; + + if (buildtag[0] == 0) { +#ifdef MODULE + sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); +#else + buildtag[0] = 'b'; +#endif + } + + return buildtag; +} diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index e43dfb9eb6ad..af9309175637 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3776,24 +3776,6 @@ _drbd_insert_fault(struct drbd_device *device, unsigned int type) } #endif -const char *drbd_buildtag(void) -{ - /* DRBD built from external sources has here a reference to the - git hash of the source code. */ - - static char buildtag[38] = "\0uilt-in"; - - if (buildtag[0] == 0) { -#ifdef MODULE - sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); -#else - buildtag[0] = 'b'; -#endif - } - - return buildtag; -} - module_init(drbd_init) module_exit(drbd_cleanup) -- cgit v1.2.3 From 4e2da933b9f19d8098374515ee0984a20202e674 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?= Date: Fri, 13 Jan 2023 13:35:05 +0100 Subject: drbd: drop API_VERSION define MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the genetlink api version as defined in drbd_genl_api.h. Signed-off-by: Christoph Böhmwalder Reviewed-by: Joel Colledge Link: https://lore.kernel.org/r/20230113123506.144082-3-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_debugfs.c | 2 +- drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_proc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index a72c096aa5b1..12460b584bcb 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -844,7 +844,7 @@ static int drbd_version_show(struct seq_file *m, void *ignored) { seq_printf(m, "# %s\n", drbd_buildtag()); seq_printf(m, "VERSION=%s\n", REL_VERSION); - seq_printf(m, "API_VERSION=%u\n", API_VERSION); + seq_printf(m, "API_VERSION=%u\n", GENL_MAGIC_VERSION); seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN); seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX); return 0; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index af9309175637..2c764f7ee4a7 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2899,7 +2899,7 @@ static int __init drbd_init(void) pr_info("initialized. " "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", - API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); + GENL_MAGIC_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); pr_info("%s\n", drbd_buildtag()); pr_info("registered as block device major %d\n", DRBD_MAJOR); return 0; /* Success! */ diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 2227fb0db1ce..1d0feafceadc 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -228,7 +228,7 @@ int drbd_seq_show(struct seq_file *seq, void *v) }; seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", - API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag()); + GENL_MAGIC_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag()); /* cs .. connection state -- cgit v1.2.3 From 20f2a34a421b1716b96d1e34d4f4948bf4b4ba1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?= Date: Fri, 13 Jan 2023 13:35:06 +0100 Subject: drbd: split off drbd_config into separate file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be more similar to what we do in the out-of-tree module and ease the upstreaming process. Signed-off-by: Christoph Böhmwalder Reviewed-by: Joel Colledge Link: https://lore.kernel.org/r/20230113123506.144082-4-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_buildtag.c | 2 +- drivers/block/drbd/drbd_int.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/drbd/drbd_buildtag.c b/drivers/block/drbd/drbd_buildtag.c index 956a4d5c339b..cb1aa66d7d5d 100644 --- a/drivers/block/drbd/drbd_buildtag.c +++ b/drivers/block/drbd/drbd_buildtag.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#include +#include #include const char *drbd_buildtag(void) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index ae713338aa46..67b4e86634ec 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -34,6 +34,7 @@ #include #include #include +#include #include "drbd_strings.h" #include "drbd_state.h" #include "drbd_protocol.h" -- cgit v1.2.3 From 069182007d1ad05b6aaadd9f3864c33b279e2685 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?= Date: Fri, 13 Jan 2023 13:35:34 +0100 Subject: drbd: remove unnecessary assignment in vli_encode_bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Christoph Böhmwalder Reviewed-by: Joel Colledge Link: https://lore.kernel.org/r/20230113123538.144276-5-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_vli.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h index 1ee81e3c2152..941c511cc4da 100644 --- a/drivers/block/drbd/drbd_vli.h +++ b/drivers/block/drbd/drbd_vli.h @@ -327,7 +327,7 @@ static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) */ static inline int vli_encode_bits(struct bitstream *bs, u64 in) { - u64 code = code; + u64 code; int bits = __vli_encode_bits(&code, in); if (bits <= 0) -- cgit v1.2.3 From 9cf766a457995a95d7f66d78cf749d05067d68a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?= Date: Fri, 13 Jan 2023 13:35:35 +0100 Subject: drbd: remove macros using require_context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This require_context attribute originated in a proposed sparse patch by Philipp Reisner back in 2008. Johannes Berg had a different solution to a similar problem, and that patch "won" in the end; so the require_context thing never got merged. The whole history can be read at [0]. DRBD kept using these annotations anyway for a while. Nowadays, on a modern unmodified sparse, they obviously do nothing, and they are hardly used anymore anyway. So, just remove the definitions of these macros. [0] https://www.spinics.net/lists/linux-sparse/msg01150.html Signed-off-by: Christoph Böhmwalder Reviewed-by: Joel Colledge Link: https://lore.kernel.org/r/20230113123538.144276-6-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 67b4e86634ec..d89b7d03d4c8 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -40,16 +40,6 @@ #include "drbd_protocol.h" #include "drbd_polymorph_printk.h" -#ifdef __CHECKER__ -# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) -# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) -# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) -#else -# define __protected_by(x) -# define __protected_read_by(x) -# define __protected_write_by(x) -#endif - /* shared module parameters, defined in drbd_main.c */ #ifdef CONFIG_DRBD_FAULT_INJECTION extern int drbd_enable_faults; @@ -775,7 +765,7 @@ struct drbd_device { unsigned long flags; /* configured by drbdsetup */ - struct drbd_backing_dev *ldev __protected_by(local); + struct drbd_backing_dev *ldev; sector_t p_size; /* partner's disk size */ struct request_queue *rq_queue; -- cgit v1.2.3 From 2990ca29f36171e052ea42d8464ec2e21cf4485a Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 13 Jan 2023 13:35:37 +0100 Subject: drbd: interval tree: make removing an "empty" interval a no-op MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trying to remove an "empty" (just initialized, or "cleared") interval from the tree, this results in an endless loop. As we typically protect the tree with a spinlock_irq, the result is a hung system. Be nice to error cleanup code paths, ignore removal of empty intervals. Signed-off-by: Lars Ellenberg Signed-off-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20230113123538.144276-8-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_interval.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers') diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c index 5024ffd6143d..b6aaf0d4d85b 100644 --- a/drivers/block/drbd/drbd_interval.c +++ b/drivers/block/drbd/drbd_interval.c @@ -95,6 +95,10 @@ drbd_contains_interval(struct rb_root *root, sector_t sector, void drbd_remove_interval(struct rb_root *root, struct drbd_interval *this) { + /* avoid endless loop */ + if (drbd_interval_empty(this)) + return; + rb_erase_augmented(&this->rb, root, &augment_callbacks); } -- cgit v1.2.3 From 2bb34fa6ff4183b42c397866ec2443ab5eabc280 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 13 Jan 2023 13:35:38 +0100 Subject: drbd: drbd_insert_interval(): Clarify comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Andreas Gruenbacher Signed-off-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20230113123538.144276-9-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_interval.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c index b6aaf0d4d85b..873beda6de24 100644 --- a/drivers/block/drbd/drbd_interval.c +++ b/drivers/block/drbd/drbd_interval.c @@ -58,7 +58,7 @@ drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) * drbd_contains_interval - check if a tree contains a given interval * @root: red black tree root * @sector: start sector of @interval - * @interval: may not be a valid pointer + * @interval: may be an invalid pointer * * Returns if the tree contains the node @interval with start sector @start. * Does not dereference @interval until @interval is known to be a valid object -- cgit v1.2.3 From ed878d1c1c641c4a6bd366658fc8e6bc842b80d1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Jan 2023 12:17:06 +0800 Subject: ublk_drv: remove nr_aborted_queues from ublk_device No one uses 'nr_aborted_queues' any more, so remove it. Reviewed-by: ZiyangZhang Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230106041711.914434-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index e54693204630..4232089e3723 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -159,7 +159,6 @@ struct ublk_device { struct completion completion; unsigned int nr_queues_ready; - atomic_t nr_aborted_queues; /* * Our ubq->daemon may be killed without any notification, so -- cgit v1.2.3 From 73a166d9749230d598320fdae3b687cdc0e2e205 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Jan 2023 12:17:07 +0800 Subject: ublk_drv: don't probe partitions if the ubq daemon isn't trusted If any ubq daemon is unprivileged, the ublk char device is allowed for unprivileged user actually, and we can't trust the current user, so not probe partitions. Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Reviewed-by: ZiyangZhang Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230106041711.914434-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 4232089e3723..8a6f38cc62db 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -159,6 +159,7 @@ struct ublk_device { struct completion completion; unsigned int nr_queues_ready; + unsigned int nr_privileged_daemon; /* * Our ubq->daemon may be killed without any notification, so @@ -1178,6 +1179,9 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) ubq->ubq_daemon = current; get_task_struct(ubq->ubq_daemon); ub->nr_queues_ready++; + + if (capable(CAP_SYS_ADMIN)) + ub->nr_privileged_daemon++; } if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) complete_all(&ub->completion); @@ -1534,6 +1538,10 @@ static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) if (ret) goto out_put_disk; + /* don't probe partitions if any one ubq daemon is un-trusted */ + if (ub->nr_privileged_daemon != ub->nr_queues_ready) + set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); + get_device(&ub->cdev_dev); ret = add_disk(disk); if (ret) { @@ -1935,6 +1943,7 @@ static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) /* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */ ub->mm = NULL; ub->nr_queues_ready = 0; + ub->nr_privileged_daemon = 0; init_completion(&ub->completion); ret = 0; out_unlock: -- cgit v1.2.3 From bfbcef036396a73fbf4b3fee385cc670159df5ad Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Jan 2023 12:17:08 +0800 Subject: ublk_drv: move ublk_get_device_from_id into ublk_ctrl_uring_cmd It is annoying for each control command handler to get/put ublk device and deal with failure. Control command handler is simplified a lot by moving ublk_get_device_from_id into ublk_ctrl_uring_cmd(). Reviewed-by: ZiyangZhang Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230106041711.914434-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 138 +++++++++++++++++------------------------------ 1 file changed, 49 insertions(+), 89 deletions(-) (limited to 'drivers') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 8a6f38cc62db..b015e46b59bb 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1496,21 +1496,16 @@ static struct ublk_device *ublk_get_device_from_id(int idx) return ub; } -static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) +static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; int ublksrv_pid = (int)header->data[0]; - struct ublk_device *ub; struct gendisk *disk; int ret = -EINVAL; if (ublksrv_pid <= 0) return -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - wait_for_completion_interruptible(&ub->completion); schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); @@ -1559,21 +1554,20 @@ out_put_disk: put_disk(disk); out_unlock: mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } -static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) +static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; - struct ublk_device *ub; cpumask_var_t cpumask; unsigned long queue; unsigned int retlen; unsigned int i; - int ret = -EINVAL; - + int ret; + if (header->len * BITS_PER_BYTE < nr_cpu_ids) return -EINVAL; if (header->len & (sizeof(unsigned long)-1)) @@ -1581,17 +1575,12 @@ static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) if (!header->addr) return -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - queue = header->data[0]; if (queue >= ub->dev_info.nr_hw_queues) - goto out_put_device; + return -EINVAL; - ret = -ENOMEM; if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) - goto out_put_device; + return -ENOMEM; for_each_possible_cpu(i) { if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue) @@ -1609,8 +1598,6 @@ static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) ret = 0; out_free_cpumask: free_cpumask_var(cpumask); -out_put_device: - ublk_put_device(ub); return ret; } @@ -1731,30 +1718,27 @@ static inline bool ublk_idr_freed(int id) return ptr == NULL; } -static int ublk_ctrl_del_dev(int idx) +static int ublk_ctrl_del_dev(struct ublk_device **p_ub) { - struct ublk_device *ub; + struct ublk_device *ub = *p_ub; + int idx = ub->ub_number; int ret; ret = mutex_lock_killable(&ublk_ctl_mutex); if (ret) return ret; - ub = ublk_get_device_from_id(idx); - if (ub) { - ublk_remove(ub); - ublk_put_device(ub); - ret = 0; - } else { - ret = -ENODEV; - } + ublk_remove(ub); + + /* Mark the reference as consumed */ + *p_ub = NULL; + ublk_put_device(ub); /* * Wait until the idr is removed, then it can be reused after * DEL_DEV command is returned. */ - if (!ret) - wait_event(ublk_idr_wq, ublk_idr_freed(idx)); + wait_event(ublk_idr_wq, ublk_idr_freed(idx)); mutex_unlock(&ublk_ctl_mutex); return ret; @@ -1769,50 +1753,36 @@ static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) header->data[0], header->addr, header->len); } -static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd) +static int ublk_ctrl_stop_dev(struct ublk_device *ub) { - struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; - struct ublk_device *ub; - - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - ublk_stop_dev(ub); cancel_work_sync(&ub->stop_work); cancel_work_sync(&ub->quiesce_work); - ublk_put_device(ub); return 0; } -static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd) +static int ublk_ctrl_get_dev_info(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; - struct ublk_device *ub; - int ret = 0; if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr) return -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info))) - ret = -EFAULT; - ublk_put_device(ub); + return -EFAULT; - return ret; + return 0; } -static int ublk_ctrl_get_params(struct io_uring_cmd *cmd) +static int ublk_ctrl_get_params(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_params_header ph; - struct ublk_device *ub; int ret; if (header->len <= sizeof(ph) || !header->addr) @@ -1827,10 +1797,6 @@ static int ublk_ctrl_get_params(struct io_uring_cmd *cmd) if (ph.len > sizeof(struct ublk_params)) ph.len = sizeof(struct ublk_params); - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - mutex_lock(&ub->mutex); if (copy_to_user(argp, &ub->params, ph.len)) ret = -EFAULT; @@ -1838,16 +1804,15 @@ static int ublk_ctrl_get_params(struct io_uring_cmd *cmd) ret = 0; mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } -static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) +static int ublk_ctrl_set_params(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_params_header ph; - struct ublk_device *ub; int ret = -EFAULT; if (header->len <= sizeof(ph) || !header->addr) @@ -1862,10 +1827,6 @@ static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) if (ph.len > sizeof(struct ublk_params)) ph.len = sizeof(struct ublk_params); - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return -EINVAL; - /* parameters can only be changed when device isn't live */ mutex_lock(&ub->mutex); if (ub->dev_info.state == UBLK_S_DEV_LIVE) { @@ -1878,7 +1839,6 @@ static int ublk_ctrl_set_params(struct io_uring_cmd *cmd) ret = ublk_validate_params(ub); } mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } @@ -1905,17 +1865,13 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) } } -static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) +static int ublk_ctrl_start_recovery(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; - struct ublk_device *ub; int ret = -EINVAL; int i; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return ret; - mutex_lock(&ub->mutex); if (!ublk_can_use_recovery(ub)) goto out_unlock; @@ -1948,21 +1904,16 @@ static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd) ret = 0; out_unlock: mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } -static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd) +static int ublk_ctrl_end_recovery(struct ublk_device *ub, + struct io_uring_cmd *cmd) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; int ublksrv_pid = (int)header->data[0]; - struct ublk_device *ub; int ret = -EINVAL; - ub = ublk_get_device_from_id(header->dev_id); - if (!ub) - return ret; - pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n", __func__, ub->dev_info.nr_hw_queues, header->dev_id); /* wait until new ubq_daemon sending all FETCH_REQ */ @@ -1990,7 +1941,6 @@ static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd) ret = 0; out_unlock: mutex_unlock(&ub->mutex); - ublk_put_device(ub); return ret; } @@ -1998,6 +1948,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + struct ublk_device *ub = NULL; int ret = -EINVAL; if (issue_flags & IO_URING_F_NONBLOCK) @@ -2012,41 +1963,50 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (!capable(CAP_SYS_ADMIN)) goto out; - ret = -ENODEV; + if (cmd->cmd_op != UBLK_CMD_ADD_DEV) { + ret = -ENODEV; + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + goto out; + } + switch (cmd->cmd_op) { case UBLK_CMD_START_DEV: - ret = ublk_ctrl_start_dev(cmd); + ret = ublk_ctrl_start_dev(ub, cmd); break; case UBLK_CMD_STOP_DEV: - ret = ublk_ctrl_stop_dev(cmd); + ret = ublk_ctrl_stop_dev(ub); break; case UBLK_CMD_GET_DEV_INFO: - ret = ublk_ctrl_get_dev_info(cmd); + ret = ublk_ctrl_get_dev_info(ub, cmd); break; case UBLK_CMD_ADD_DEV: ret = ublk_ctrl_add_dev(cmd); break; case UBLK_CMD_DEL_DEV: - ret = ublk_ctrl_del_dev(header->dev_id); + ret = ublk_ctrl_del_dev(&ub); break; case UBLK_CMD_GET_QUEUE_AFFINITY: - ret = ublk_ctrl_get_queue_affinity(cmd); + ret = ublk_ctrl_get_queue_affinity(ub, cmd); break; case UBLK_CMD_GET_PARAMS: - ret = ublk_ctrl_get_params(cmd); + ret = ublk_ctrl_get_params(ub, cmd); break; case UBLK_CMD_SET_PARAMS: - ret = ublk_ctrl_set_params(cmd); + ret = ublk_ctrl_set_params(ub, cmd); break; case UBLK_CMD_START_USER_RECOVERY: - ret = ublk_ctrl_start_recovery(cmd); + ret = ublk_ctrl_start_recovery(ub, cmd); break; case UBLK_CMD_END_USER_RECOVERY: - ret = ublk_ctrl_end_recovery(cmd); + ret = ublk_ctrl_end_recovery(ub, cmd); break; default: + ret = -ENOTSUPP; break; } + if (ub) + ublk_put_device(ub); out: io_uring_cmd_done(cmd, ret, 0); pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", -- cgit v1.2.3 From abb864d380854b5427b6b070beb2ebc291ce4d1e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Jan 2023 12:17:09 +0800 Subject: ublk_drv: add device parameter UBLK_PARAM_TYPE_DEVT Userspace side only knows device ID, but the associated path of ublkc* and ublkb* could be changed by udev, and that depends on userspace's policy, so add parameter of UBLK_PARAM_TYPE_DEVT for retrieving major/minor of the ublkc* and ublkb*, then user may figure out major/minor of the ublk disks he/she owns. With major/minor, it is easy to find the device node path. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230106041711.914434-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b015e46b59bb..75033304b900 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -54,7 +54,8 @@ | UBLK_F_USER_RECOVERY_REISSUE) /* All UBLK_PARAM_TYPE_* should be included here */ -#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) +#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ + UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT) struct ublk_rq_data { struct llist_node node; @@ -255,6 +256,10 @@ static int ublk_validate_params(const struct ublk_device *ub) return -EINVAL; } + /* dev_t is read-only */ + if (ub->params.types & UBLK_PARAM_TYPE_DEVT) + return -EINVAL; + return 0; } @@ -1777,6 +1782,22 @@ static int ublk_ctrl_get_dev_info(struct ublk_device *ub, return 0; } +/* TYPE_DEVT is readonly, so fill it up before returning to userspace */ +static void ublk_ctrl_fill_params_devt(struct ublk_device *ub) +{ + ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt); + ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt); + + if (ub->ub_disk) { + ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk)); + ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk)); + } else { + ub->params.devt.disk_major = 0; + ub->params.devt.disk_minor = 0; + } + ub->params.types |= UBLK_PARAM_TYPE_DEVT; +} + static int ublk_ctrl_get_params(struct ublk_device *ub, struct io_uring_cmd *cmd) { @@ -1798,6 +1819,7 @@ static int ublk_ctrl_get_params(struct ublk_device *ub, ph.len = sizeof(struct ublk_params); mutex_lock(&ub->mutex); + ublk_ctrl_fill_params_devt(ub); if (copy_to_user(argp, &ub->params, ph.len)) ret = -EFAULT; else -- cgit v1.2.3 From 403ebc877832752da9fc851284fa00ceca7b2fae Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Jan 2023 12:17:10 +0800 Subject: ublk_drv: add module parameter of ublks_max for limiting max allowed ublk dev Prepare for supporting unprivileged ublk device by limiting max number ublk devices added. Otherwise too many ublk devices could be added by un-trusted user, which can be thought as one DoS. Reviewed-by: ZiyangZhang Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230106041711.914434-6-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'drivers') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 75033304b900..883f7974d105 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -186,6 +186,15 @@ static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */ static DEFINE_MUTEX(ublk_ctl_mutex); +/* + * Max ublk devices allowed to add + * + * It can be extended to one per-user limit in future or even controlled + * by cgroup. + */ +static unsigned int ublks_max = 64; +static unsigned int ublks_added; /* protected by ublk_ctl_mutex */ + static struct miscdevice ublk_misc; static void ublk_dev_param_basic_apply(struct ublk_device *ub) @@ -1441,6 +1450,8 @@ static int ublk_add_chdev(struct ublk_device *ub) ret = cdev_device_add(&ub->cdev, dev); if (ret) goto fail; + + ublks_added++; return 0; fail: put_device(dev); @@ -1483,6 +1494,7 @@ static void ublk_remove(struct ublk_device *ub) cancel_work_sync(&ub->quiesce_work); cdev_device_del(&ub->cdev, &ub->cdev_dev); put_device(&ub->cdev_dev); + ublks_added--; } static struct ublk_device *ublk_get_device_from_id(int idx) @@ -1642,6 +1654,10 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) if (ret) return ret; + ret = -EACCES; + if (ublks_added >= ublks_max) + goto out_unlock; + ret = -ENOMEM; ub = kzalloc(sizeof(*ub), GFP_KERNEL); if (!ub) @@ -2095,5 +2111,8 @@ static void __exit ublk_exit(void) module_init(ublk_init); module_exit(ublk_exit); +module_param(ublks_max, int, 0444); +MODULE_PARM_DESC(ublks_max, "max number of ublk devices allowed to add(default: 64)"); + MODULE_AUTHOR("Ming Lei "); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 4093cb5a06343ea3936ae46664d132c82576b153 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Jan 2023 12:17:11 +0800 Subject: ublk_drv: add mechanism for supporting unprivileged ublk device unprivileged ublk device is helpful for container use case, such as: ublk device created in one unprivileged container can be controlled and accessed by this container only. Implement this feature by adding flag of UBLK_F_UNPRIVILEGED_DEV, and if this flag isn't set, any control command has been run from privileged user. Otherwise, any control command can be sent from any unprivileged user, but the user has to be permitted to access the ublk char device to be controlled. In case of UBLK_F_UNPRIVILEGED_DEV: 1) for command UBLK_CMD_ADD_DEV, it is always allowed, and user needs to provide owner's uid/gid in this command, so that udev can set correct ownership for the created ublk device, since the device owner uid/gid can be queried via command of UBLK_CMD_GET_DEV_INFO. 2) for other control commands, they can only be run successfully if the current user is allowed to access the specified ublk char device, for running the permission check, path of the ublk char device has to be provided by these commands. Also add one control of command UBLK_CMD_GET_DEV_INFO2 which always include the char dev path in payload since userspace may not have knowledge if this device is created in unprivileged mode. For applying this mechanism, system administrator needs to take the following policies: 1) chmod 0666 /dev/ublk-control 2) change ownership of ublkcN & ublkbN - chown owner_uid:owner_gid /dev/ublkcN - chown owner_uid:owner_gid /dev/ublkbN Both can be done via one simple udev rule. Userspace: https://github.com/ming1/ubdsrv/tree/unprivileged-ublk 'ublk add -t $TYPE --un_privileged=1' is for creating one un-privileged ublk device if the user is un-privileged. Link: https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/ Suggested-by: Stefan Hajnoczi Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230106041711.914434-7-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 152 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 146 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 883f7974d105..a725a236a38f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #define UBLK_MINORS (1U << MINORBITS) @@ -51,7 +52,8 @@ | UBLK_F_URING_CMD_COMP_IN_TASK \ | UBLK_F_NEED_GET_DATA \ | UBLK_F_USER_RECOVERY \ - | UBLK_F_USER_RECOVERY_REISSUE) + | UBLK_F_USER_RECOVERY_REISSUE \ + | UBLK_F_UNPRIVILEGED_DEV) /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ @@ -1618,6 +1620,17 @@ out_free_cpumask: return ret; } +static void ublk_store_owner_uid_gid(struct ublksrv_ctrl_dev_info *info) +{ + kuid_t uid; + kgid_t gid; + + current_uid_gid(&uid, &gid); + + info->owner_uid = from_kuid(&init_user_ns, uid); + info->owner_gid = from_kgid(&init_user_ns, gid); +} + static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) { pr_devel("%s: dev id %d flags %llx\n", __func__, @@ -1641,15 +1654,26 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) __func__, header->queue_id); return -EINVAL; } + if (copy_from_user(&info, argp, sizeof(info))) return -EFAULT; - ublk_dump_dev_info(&info); + + if (capable(CAP_SYS_ADMIN)) + info.flags &= ~UBLK_F_UNPRIVILEGED_DEV; + else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) + return -EPERM; + + /* the created device is always owned by current user */ + ublk_store_owner_uid_gid(&info); + if (header->dev_id != info.dev_id) { pr_warn("%s: dev id not match %u %u\n", __func__, header->dev_id, info.dev_id); return -EINVAL; } + ublk_dump_dev_info(&info); + ret = mutex_lock_killable(&ublk_ctl_mutex); if (ret) return ret; @@ -1982,6 +2006,115 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, return ret; } +/* + * All control commands are sent via /dev/ublk-control, so we have to check + * the destination device's permission + */ +static int ublk_char_dev_permission(struct ublk_device *ub, + const char *dev_path, int mask) +{ + int err; + struct path path; + struct kstat stat; + + err = kern_path(dev_path, LOOKUP_FOLLOW, &path); + if (err) + return err; + + err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); + if (err) + goto exit; + + err = -EPERM; + if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode)) + goto exit; + + err = inode_permission(&init_user_ns, + d_backing_inode(path.dentry), mask); +exit: + path_put(&path); + return err; +} + +static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, + struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV; + void __user *argp = (void __user *)(unsigned long)header->addr; + char *dev_path = NULL; + int ret = 0; + int mask; + + if (!unprivileged) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * The new added command of UBLK_CMD_GET_DEV_INFO2 includes + * char_dev_path in payload too, since userspace may not + * know if the specified device is created as unprivileged + * mode. + */ + if (cmd->cmd_op != UBLK_CMD_GET_DEV_INFO2) + return 0; + } + + /* + * User has to provide the char device path for unprivileged ublk + * + * header->addr always points to the dev path buffer, and + * header->dev_path_len records length of dev path buffer. + */ + if (!header->dev_path_len || header->dev_path_len > PATH_MAX) + return -EINVAL; + + if (header->len < header->dev_path_len) + return -EINVAL; + + dev_path = kmalloc(header->dev_path_len + 1, GFP_KERNEL); + if (!dev_path) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(dev_path, argp, header->dev_path_len)) + goto exit; + dev_path[header->dev_path_len] = 0; + + ret = -EINVAL; + switch (cmd->cmd_op) { + case UBLK_CMD_GET_DEV_INFO: + case UBLK_CMD_GET_DEV_INFO2: + case UBLK_CMD_GET_QUEUE_AFFINITY: + case UBLK_CMD_GET_PARAMS: + mask = MAY_READ; + break; + case UBLK_CMD_START_DEV: + case UBLK_CMD_STOP_DEV: + case UBLK_CMD_ADD_DEV: + case UBLK_CMD_DEL_DEV: + case UBLK_CMD_SET_PARAMS: + case UBLK_CMD_START_USER_RECOVERY: + case UBLK_CMD_END_USER_RECOVERY: + mask = MAY_READ | MAY_WRITE; + break; + default: + goto exit; + } + + ret = ublk_char_dev_permission(ub, dev_path, mask); + if (!ret) { + header->len -= header->dev_path_len; + header->addr += header->dev_path_len; + } + pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n", + __func__, ub->ub_number, cmd->cmd_op, + ub->dev_info.owner_uid, ub->dev_info.owner_gid, + dev_path, ret); +exit: + kfree(dev_path); + return ret; +} + static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -1997,17 +2130,21 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (!(issue_flags & IO_URING_F_SQE128)) goto out; - ret = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out; - if (cmd->cmd_op != UBLK_CMD_ADD_DEV) { ret = -ENODEV; ub = ublk_get_device_from_id(header->dev_id); if (!ub) goto out; + + ret = ublk_ctrl_uring_cmd_permission(ub, cmd); + } else { + /* ADD_DEV permission check is done in command handler */ + ret = 0; } + if (ret) + goto put_dev; + switch (cmd->cmd_op) { case UBLK_CMD_START_DEV: ret = ublk_ctrl_start_dev(ub, cmd); @@ -2016,6 +2153,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_ctrl_stop_dev(ub); break; case UBLK_CMD_GET_DEV_INFO: + case UBLK_CMD_GET_DEV_INFO2: ret = ublk_ctrl_get_dev_info(ub, cmd); break; case UBLK_CMD_ADD_DEV: @@ -2043,6 +2181,8 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = -ENOTSUPP; break; } + + put_dev: if (ub) ublk_put_device(ub); out: -- cgit v1.2.3 From 888545cb43d7639457883dc325c303a3737c0a64 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Tue, 17 Jan 2023 17:36:37 +0530 Subject: nvme: set REQ_ALLOC_CACHE for uring-passthru request This patch sets REQ_ALLOC_CACHE flag for uring-passthru requests. This is a prep-patch so that normal / IRQ-driven uring-passthru I/Os can also leverage bio-cache. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20230117120638.72254-2-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 06f52db34be9..ffaabf16dd4c 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -554,7 +554,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_uring_data d; struct nvme_command c; struct request *req; - blk_opf_t rq_flags = 0; + blk_opf_t rq_flags = REQ_ALLOC_CACHE; blk_mq_req_flags_t blk_flags = 0; void *meta = NULL; int ret; @@ -590,7 +590,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, d.timeout_ms = READ_ONCE(cmd->timeout_ms); if (issue_flags & IO_URING_F_NONBLOCK) { - rq_flags = REQ_NOWAIT; + rq_flags |= REQ_NOWAIT; blk_flags = BLK_MQ_REQ_NOWAIT; } if (issue_flags & IO_URING_F_IOPOLL) -- cgit v1.2.3 From d67ea690ce0983dadf59cd06facc18f3acc89cea Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Tue, 10 Jan 2023 15:36:35 +0100 Subject: block: introduce bdev_zone_no helper Add a generic bdev_zone_no() helper to calculate zone number for a given sector in a block device. This helper internally uses disk_zone_no() to find the zone number. Use the helper bdev_zone_no() to calculate nr of zones. This lets us make modifications to the math if needed in one place. Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20230110143635.77300-4-p.raghav@samsung.com Signed-off-by: Jens Axboe --- drivers/nvme/target/zns.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 1254cf57e008..7e4292d88016 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -254,8 +254,7 @@ static unsigned long nvmet_req_nr_zones_from_slba(struct nvmet_req *req) { unsigned int sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba); - return bdev_nr_zones(req->ns->bdev) - - (sect >> ilog2(bdev_zone_sectors(req->ns->bdev))); + return bdev_nr_zones(req->ns->bdev) - bdev_zone_no(req->ns->bdev, sect); } static unsigned long get_nr_zones_from_buf(struct nvmet_req *req, u32 bufsize) -- cgit v1.2.3 From 1bf7a749efdc5183e83239e030596ef74b9c8b13 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Jan 2023 08:47:18 +0100 Subject: ps3vram: remove bio splitting ps3vram iterates over the bio one segment, that is page aligned and max page sized chunk, a time. Because of that there is no point in calling bio_split_to_limits, or explicitly setting the default limits that are only used by bio_split_to_limits. Signed-off-by: Christoph Hellwig Tested-by: Geoff Levand Link: https://lore.kernel.org/r/20230123074718.57951-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/ps3vram.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'drivers') diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 574e470b220b..38d42af01b25 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -586,10 +586,6 @@ static void ps3vram_submit_bio(struct bio *bio) dev_dbg(&dev->core, "%s\n", __func__); - bio = bio_split_to_limits(bio); - if (!bio) - return; - spin_lock_irq(&priv->lock); busy = !bio_list_empty(&priv->list); bio_list_add(&priv->list, bio); @@ -749,9 +745,6 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) gendisk->private_data = dev; strscpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); set_capacity(gendisk, priv->size >> 9); - blk_queue_max_segments(gendisk->queue, BLK_MAX_SEGMENTS); - blk_queue_max_segment_size(gendisk->queue, BLK_MAX_SEGMENT_SIZE); - blk_queue_max_hw_sectors(gendisk->queue, BLK_SAFE_MAX_SECTORS); dev_info(&dev->core, "%s: Using %llu MiB of GPU memory\n", gendisk->disk_name, get_capacity(gendisk) >> 11); -- cgit v1.2.3 From 9607cd36bb23e1c3271a394407c0caf4f7a207ab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Jan 2023 08:53:56 +0100 Subject: s390/dcssblk:: don't call bio_split_to_limits s390 iterates over the bio using bio_for_each_segment and doesn't need any bio splitting. Signed-off-by: Christoph Hellwig Acked-by: Alexander Gordeev Link: https://lore.kernel.org/r/20230123075356.60847-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/s390/block/dcssblk.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers') diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index c0f85ffb2b62..c09f2e053bf8 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -864,10 +864,6 @@ dcssblk_submit_bio(struct bio *bio) unsigned long source_addr; unsigned long bytes_done; - bio = bio_split_to_limits(bio); - if (!bio) - return; - bytes_done = 0; dev_info = bio->bi_bdev->bd_disk->private_data; if (dev_info == NULL) -- cgit v1.2.3 From 48a9051980242128f844defe44c7e83217f79872 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 31 Jan 2023 12:04:46 +0800 Subject: ublk_drv: only allow owner to open unprivileged disk Owner of one unprivileged ublk device could be one evil user, which can grant this disk's privilege to other users deliberately, and this way could be like making one trap and waiting for other users to be caught. So only owner to open unprivileged disk even though the owner grants disk privilege to other user. This way is reasonable too given anyone can create ublk disk, and no need other's grant. Reported-by: Stefan Hajnoczi Fixes: 4093cb5a0634 ("ublk_drv: add mechanism for supporting unprivileged ublk device") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230131040446.214583-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 55 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index a725a236a38f..c932e9ea5a0f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -377,8 +377,50 @@ static void ublk_free_disk(struct gendisk *disk) put_device(&ub->cdev_dev); } +static void ublk_store_owner_uid_gid(unsigned int *owner_uid, + unsigned int *owner_gid) +{ + kuid_t uid; + kgid_t gid; + + current_uid_gid(&uid, &gid); + + *owner_uid = from_kuid(&init_user_ns, uid); + *owner_gid = from_kgid(&init_user_ns, gid); +} + +static int ublk_open(struct block_device *bdev, fmode_t mode) +{ + struct ublk_device *ub = bdev->bd_disk->private_data; + + if (capable(CAP_SYS_ADMIN)) + return 0; + + /* + * If it is one unprivileged device, only owner can open + * the disk. Otherwise it could be one trap made by one + * evil user who grants this disk's privileges to other + * users deliberately. + * + * This way is reasonable too given anyone can create + * unprivileged device, and no need other's grant. + */ + if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) { + unsigned int curr_uid, curr_gid; + + ublk_store_owner_uid_gid(&curr_uid, &curr_gid); + + if (curr_uid != ub->dev_info.owner_uid || curr_gid != + ub->dev_info.owner_gid) + return -EPERM; + } + + return 0; +} + static const struct block_device_operations ub_fops = { .owner = THIS_MODULE, + .open = ublk_open, .free_disk = ublk_free_disk, }; @@ -1620,17 +1662,6 @@ out_free_cpumask: return ret; } -static void ublk_store_owner_uid_gid(struct ublksrv_ctrl_dev_info *info) -{ - kuid_t uid; - kgid_t gid; - - current_uid_gid(&uid, &gid); - - info->owner_uid = from_kuid(&init_user_ns, uid); - info->owner_gid = from_kgid(&init_user_ns, gid); -} - static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) { pr_devel("%s: dev id %d flags %llx\n", __func__, @@ -1664,7 +1695,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) return -EPERM; /* the created device is always owned by current user */ - ublk_store_owner_uid_gid(&info); + ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); if (header->dev_id != info.dev_id) { pr_warn("%s: dev id not match %u %u\n", -- cgit v1.2.3 From 0686fb3cc535c4c1685553c56f4efe3eeee57b09 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 8 Dec 2022 08:49:20 +0100 Subject: nvme-fabrics: clarify AUTHREQ result handling The NVMe 2.0 spec defines the ATR and ASCR bits in the AUTHREQ connect response field to be mutually exclusive. So to clarify the handling here switch the AUTHREQ handling to use the bit definitions and check for both bits. And while we're at it, add a message to the user that secure concatenation is not supported (yet). Suggested-by: Mark Lehrer Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index ce27276f552d..bbaa04a0c502 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -410,7 +410,14 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) result = le32_to_cpu(res.u32); ctrl->cntlid = result & 0xFFFF; - if ((result >> 16) & 0x3) { + if (result & (NVME_CONNECT_AUTHREQ_ATR | NVME_CONNECT_AUTHREQ_ASCR)) { + /* Secure concatenation is not implemented */ + if (result & NVME_CONNECT_AUTHREQ_ASCR) { + dev_warn(ctrl->device, + "qid 0: secure concatenation is not supported\n"); + ret = NVME_SC_AUTH_REQUIRED; + goto out_free_data; + } /* Authentication required */ ret = nvme_auth_negotiate(ctrl, 0); if (ret) { @@ -486,7 +493,14 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) &cmd, data); } result = le32_to_cpu(res.u32); - if ((result >> 16) & 2) { + if (result & (NVME_CONNECT_AUTHREQ_ATR | NVME_CONNECT_AUTHREQ_ASCR)) { + /* Secure concatenation is not implemented */ + if (result & NVME_CONNECT_AUTHREQ_ASCR) { + dev_warn(ctrl->device, + "qid 0: secure concatenation is not supported\n"); + ret = NVME_SC_AUTH_REQUIRED; + goto out_free_data; + } /* Authentication required */ ret = nvme_auth_negotiate(ctrl, qid); if (ret) { @@ -500,6 +514,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) "qid %u: authentication failed\n", qid); } } +out_free_data: kfree(data); return ret; } -- cgit v1.2.3 From b0ef1b11d3909d8f246dd3af9c94e38880d349b0 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 13 Dec 2022 20:00:26 +0100 Subject: nvme-auth: don't use NVMe status codes NVMe status codes are part of the wire protocol, and shouldn't be fabricated in the stack. So with this patch the authentication code is switched over to use error codes; as a side effect authentication failures due to internal error won't be retried anymore. But that shouldn't have happened anyway. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 4424f53a8a0a..787537454f7f 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -158,7 +158,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; - return NVME_SC_INVALID_FIELD; + return -EINVAL; } hmac_name = nvme_auth_hmac_name(data->hashid); @@ -167,7 +167,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, "qid %d: invalid HASH ID %d\n", chap->qid, data->hashid); chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; - return NVME_SC_INVALID_FIELD; + return -EPROTO; } if (chap->hash_id == data->hashid && chap->shash_tfm && @@ -193,7 +193,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, chap->qid, hmac_name, PTR_ERR(chap->shash_tfm)); chap->shash_tfm = NULL; chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; - return NVME_SC_AUTH_REQUIRED; + return -ENOMEM; } if (crypto_shash_digestsize(chap->shash_tfm) != data->hl) { @@ -203,7 +203,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, crypto_free_shash(chap->shash_tfm); chap->shash_tfm = NULL; chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; - return NVME_SC_AUTH_REQUIRED; + return -EPROTO; } chap->hash_id = data->hashid; @@ -219,7 +219,7 @@ select_kpp: chap->qid, data->dhgid); chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; /* Leave previous dh_tfm intact */ - return NVME_SC_AUTH_REQUIRED; + return -EPROTO; } if (chap->dhgroup_id == data->dhgid && @@ -242,7 +242,7 @@ select_kpp: "qid %d: empty DH value\n", chap->qid); chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; - return NVME_SC_INVALID_FIELD; + return -EPROTO; } chap->dh_tfm = crypto_alloc_kpp(kpp_name, 0, 0); @@ -254,7 +254,7 @@ select_kpp: chap->qid, ret, gid_name); chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; chap->dh_tfm = NULL; - return NVME_SC_AUTH_REQUIRED; + return -ret; } dev_dbg(ctrl->device, "qid %d: selected DH group %s\n", chap->qid, gid_name); @@ -263,7 +263,7 @@ select_kpp: "qid %d: invalid DH value for NULL DH\n", chap->qid); chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; - return NVME_SC_INVALID_FIELD; + return -EPROTO; } chap->dhgroup_id = data->dhgid; @@ -274,7 +274,7 @@ skip_kpp: chap->ctrl_key = kmalloc(dhvlen, GFP_KERNEL); if (!chap->ctrl_key) { chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; - return NVME_SC_AUTH_REQUIRED; + return -ENOMEM; } chap->ctrl_key_len = dhvlen; memcpy(chap->ctrl_key, data->cval + chap->hash_len, @@ -344,7 +344,7 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl, if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; - return NVME_SC_INVALID_FIELD; + return -EINVAL; } if (data->hl != chap->hash_len) { @@ -352,7 +352,7 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl, "qid %d: invalid hash length %u\n", chap->qid, data->hl); chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; - return NVME_SC_INVALID_FIELD; + return -EPROTO; } /* Just print out information for the admin queue */ @@ -376,7 +376,7 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl, "qid %d: controller authentication failed\n", chap->qid); chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; - return NVME_SC_AUTH_REQUIRED; + return -ECONNREFUSED; } /* Just print out information for the admin queue */ @@ -730,7 +730,7 @@ static void nvme_queue_auth_work(struct work_struct *work) NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE); if (ret) { chap->status = ret; - chap->error = NVME_SC_AUTH_REQUIRED; + chap->error = -ECONNREFUSED; return; } @@ -798,7 +798,7 @@ static void nvme_queue_auth_work(struct work_struct *work) NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1); if (ret) { chap->status = ret; - chap->error = NVME_SC_AUTH_REQUIRED; + chap->error = -ECONNREFUSED; return; } @@ -819,7 +819,7 @@ static void nvme_queue_auth_work(struct work_struct *work) ret = nvme_auth_process_dhchap_success1(ctrl, chap); if (ret) { /* Controller authentication failed */ - chap->error = NVME_SC_AUTH_REQUIRED; + chap->error = -ECONNREFUSED; goto fail2; } -- cgit v1.2.3 From 01df742d8c5c070f92fa9c046907506a2a0f7a9f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 5 Jan 2023 12:28:30 -0800 Subject: nvme-pci: remove SGL segment descriptors The max segments this driver can see is 127, well below the 256 threshold needed to add an nvme sgl segment descriptor. Remove all the useless checks and dead code. Signed-off-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 47 +++++------------------------------------------ 1 file changed, 5 insertions(+), 42 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c734934c407c..9d6d171afa66 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -548,22 +548,6 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req) } } -static void nvme_free_sgls(struct nvme_dev *dev, struct request *req) -{ - const int last_sg = SGES_PER_PAGE - 1; - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - dma_addr_t dma_addr = iod->first_dma; - int i; - - for (i = 0; i < iod->nr_allocations; i++) { - struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i]; - dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr); - - dma_pool_free(dev->prp_page_pool, sg_list, dma_addr); - dma_addr = next_dma_addr; - } -} - static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -582,7 +566,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], iod->first_dma); else if (iod->use_sgl) - nvme_free_sgls(dev, req); + dma_pool_free(dev->prp_page_pool, nvme_pci_iod_list(req)[0], + iod->first_dma); else nvme_free_prps(dev, req); mempool_free(iod->sgt.sgl, dev->iod_mempool); @@ -705,13 +690,8 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, dma_addr_t dma_addr, int entries) { sge->addr = cpu_to_le64(dma_addr); - if (entries < SGES_PER_PAGE) { - sge->length = cpu_to_le32(entries * sizeof(*sge)); - sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; - } else { - sge->length = cpu_to_le32(NVME_CTRL_PAGE_SIZE); - sge->type = NVME_SGL_FMT_SEG_DESC << 4; - } + sge->length = cpu_to_le32(entries * sizeof(*sge)); + sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, @@ -751,30 +731,12 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, iod->first_dma = sgl_dma; nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); - do { - if (i == SGES_PER_PAGE) { - struct nvme_sgl_desc *old_sg_desc = sg_list; - struct nvme_sgl_desc *link = &old_sg_desc[i - 1]; - - sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); - if (!sg_list) - goto free_sgls; - - i = 0; - nvme_pci_iod_list(req)[iod->nr_allocations++] = sg_list; - sg_list[i++] = *link; - nvme_pci_sgl_set_seg(link, sgl_dma, entries); - } - nvme_pci_sgl_set_data(&sg_list[i++], sg); sg = sg_next(sg); } while (--entries > 0); return BLK_STS_OK; -free_sgls: - nvme_free_sgls(dev, req); - return BLK_STS_RESOURCE; } static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev, @@ -3532,6 +3494,7 @@ static int __init nvme_init(void) BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); BUILD_BUG_ON(DIV_ROUND_UP(nvme_pci_npages_prp(), NVME_CTRL_PAGE_SIZE) > S8_MAX); + BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE); return pci_register_driver(&nvme_driver); } -- cgit v1.2.3 From ae5829350324a6bd93c1d9fc3ac67afc0c647e32 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 5 Jan 2023 12:28:31 -0800 Subject: nvme-pci: use mapped entries for sgl decision The driver uses the dma entries for setting up its command's SGL/PRP lists. The dma mapping might have fewer entries than the physical segments, so check the dma mapped count to determine which nvme data layout method is more optimal. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9d6d171afa66..5951a516158c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -515,10 +515,10 @@ static void **nvme_pci_iod_list(struct request *req) return (void **)(iod->sgt.sgl + blk_rq_nr_phys_segments(req)); } -static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) +static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, + int nseg) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; - int nseg = blk_rq_nr_phys_segments(req); unsigned int avg_seg_size; avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); @@ -818,7 +818,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, goto out_free_sg; } - iod->use_sgl = nvme_pci_use_sgls(dev, req); + iod->use_sgl = nvme_pci_use_sgls(dev, req, iod->sgt.nents); if (iod->use_sgl) ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); else -- cgit v1.2.3 From 7846c1b5a5db8bb8475603069df7c7af034fd081 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 5 Jan 2023 12:28:32 -0800 Subject: nvme-pci: place descriptor addresses in iod The 'struct nvme_iod' space is appended at the end of the preallocated 'struct request', and padded to the cache line size. This leaves some free memory (in most kernel configs) up for grabs. Instead of appending the nvme data descriptor addresses after the scatterlist, inline these for free within struct nvme_iod. There is now enough space in the mempool for 128 possibe segments. And without increasing the size of the preallocated requests, we can hold up to 5 PRP descriptor elements, allowing the driver to increase its max transfer size to 8MB. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 49 ++++++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 31 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5951a516158c..a331fbfa9a66 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -42,8 +42,9 @@ * These can be higher, but we need to ensure that any command doesn't * require an sg allocation that needs more than a page of data. */ -#define NVME_MAX_KB_SZ 4096 -#define NVME_MAX_SEGS 127 +#define NVME_MAX_KB_SZ 8192 +#define NVME_MAX_SEGS 128 +#define NVME_MAX_NR_ALLOCATIONS 5 static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0444); @@ -215,6 +216,11 @@ struct nvme_queue { struct completion delete_done; }; +union nvme_descriptor { + struct nvme_sgl_desc *sg_list; + __le64 *prp_list; +}; + /* * The nvme_iod describes the data in an I/O. * @@ -232,6 +238,7 @@ struct nvme_iod { dma_addr_t first_dma; dma_addr_t meta_dma; struct sg_table sgt; + union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS]; }; static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev) @@ -386,16 +393,6 @@ static int nvme_pci_npages_prp(void) return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8); } -/* - * Calculates the number of pages needed for the SGL segments. For example a 4k - * page can accommodate 256 SGL descriptors. - */ -static int nvme_pci_npages_sgl(void) -{ - return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc), - NVME_CTRL_PAGE_SIZE); -} - static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { @@ -509,12 +506,6 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) spin_unlock(&nvmeq->sq_lock); } -static void **nvme_pci_iod_list(struct request *req) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - return (void **)(iod->sgt.sgl + blk_rq_nr_phys_segments(req)); -} - static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, int nseg) { @@ -540,7 +531,7 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req) int i; for (i = 0; i < iod->nr_allocations; i++) { - __le64 *prp_list = nvme_pci_iod_list(req)[i]; + __le64 *prp_list = iod->list[i].prp_list; dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); dma_pool_free(dev->prp_page_pool, prp_list, dma_addr); @@ -563,10 +554,10 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); if (iod->nr_allocations == 0) - dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], + dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list, iod->first_dma); else if (iod->use_sgl) - dma_pool_free(dev->prp_page_pool, nvme_pci_iod_list(req)[0], + dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list, iod->first_dma); else nvme_free_prps(dev, req); @@ -598,7 +589,6 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, u64 dma_addr = sg_dma_address(sg); int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1); __le64 *prp_list; - void **list = nvme_pci_iod_list(req); dma_addr_t prp_dma; int nprps, i; @@ -636,7 +626,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, iod->nr_allocations = -1; return BLK_STS_RESOURCE; } - list[0] = prp_list; + iod->list[0].prp_list = prp_list; iod->first_dma = prp_dma; i = 0; for (;;) { @@ -645,7 +635,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) goto free_prps; - list[iod->nr_allocations++] = prp_list; + iod->list[iod->nr_allocations++].prp_list = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); i = 1; @@ -727,7 +717,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, return BLK_STS_RESOURCE; } - nvme_pci_iod_list(req)[0] = sg_list; + iod->list[0].sg_list = sg_list; iod->first_dma = sgl_dma; nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); @@ -2666,11 +2656,8 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) { - size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl()); - size_t alloc_size = sizeof(__le64 *) * npages + - sizeof(struct scatterlist) * NVME_MAX_SEGS; + size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS; - WARN_ON_ONCE(alloc_size > PAGE_SIZE); dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, mempool_kfree, (void *)alloc_size, GFP_KERNEL, @@ -3492,9 +3479,9 @@ static int __init nvme_init(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); - BUILD_BUG_ON(DIV_ROUND_UP(nvme_pci_npages_prp(), NVME_CTRL_PAGE_SIZE) > - S8_MAX); BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE); + BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE); + BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS); return pci_register_driver(&nvme_driver); } -- cgit v1.2.3 From 62281b9ed671bee71737b42cb72f3c140ac2aef1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Dec 2022 10:13:16 +0100 Subject: nvme: remove nvme_execute_passthru_rq After moving the nvme_passthru_end call to the callers of nvme_execute_passthru_rq, this function has become quite pointless, so remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Jens Axboe Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 18 ++++-------------- drivers/nvme/host/ioctl.c | 5 +++-- drivers/nvme/host/nvme.h | 3 ++- drivers/nvme/target/passthru.c | 5 +++-- 4 files changed, 12 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 505e16f20e57..564ce60bad14 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1004,7 +1004,7 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); * >0: nvme controller's cqe status response * <0: kernel error in lieu of controller response */ -static int nvme_execute_rq(struct request *rq, bool at_head) +int nvme_execute_rq(struct request *rq, bool at_head) { blk_status_t status; @@ -1015,6 +1015,7 @@ static int nvme_execute_rq(struct request *rq, bool at_head) return nvme_req(rq)->status; return blk_status_to_errno(status); } +EXPORT_SYMBOL_NS_GPL(nvme_execute_rq, NVME_TARGET_PASSTHRU); /* * Returns 0 on success. If the result is negative, it's a Linux error code; @@ -1116,8 +1117,7 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) } EXPORT_SYMBOL_NS_GPL(nvme_command_effects, NVME_TARGET_PASSTHRU); -static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - u8 opcode) +u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) { u32 effects = nvme_command_effects(ctrl, ns, opcode); @@ -1135,6 +1135,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } return effects; } +EXPORT_SYMBOL_NS_GPL(nvme_passthru_start, NVME_TARGET_PASSTHRU); void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, struct nvme_command *cmd, int status) @@ -1176,17 +1177,6 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, } EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU); -int nvme_execute_passthru_rq(struct request *rq, u32 *effects) -{ - struct nvme_command *cmd = nvme_req(rq)->cmd; - struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; - struct nvme_ns *ns = rq->q->queuedata; - - *effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); - return nvme_execute_rq(rq, false); -} -EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU); - /* * Recommended frequency for KATO commands per NVMe 1.4 section 7.12.1: * diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index ffaabf16dd4c..723e7d5b778f 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -219,6 +219,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, u64 *result, unsigned timeout, unsigned int flags) { + struct nvme_ns *ns = q->queuedata; struct nvme_ctrl *ctrl; struct request *req; void *meta = NULL; @@ -241,8 +242,8 @@ static int nvme_submit_user_cmd(struct request_queue *q, bio = req->bio; ctrl = nvme_req(req)->ctrl; - ret = nvme_execute_passthru_rq(req, &effects); - + effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); + ret = nvme_execute_rq(req, false); if (result) *result = le64_to_cpu(nvme_req(req)->result.u64); if (meta) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 424c8a467a0c..c7c45fdb3b48 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1070,7 +1070,8 @@ static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {}; u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode); -int nvme_execute_passthru_rq(struct request *rq, u32 *effects); +u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode); +int nvme_execute_rq(struct request *rq, bool at_head); void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, struct nvme_command *cmd, int status); struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index adc0958755d6..511c980d538d 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -216,11 +216,12 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w) struct nvmet_req *req = container_of(w, struct nvmet_req, p.work); struct request *rq = req->p.rq; struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; + struct nvme_ns *ns = rq->q->queuedata; u32 effects; int status; - status = nvme_execute_passthru_rq(rq, &effects); - + effects = nvme_passthru_start(ctrl, ns, req->cmd->common.opcode); + status = nvme_execute_rq(rq, false); if (status == NVME_SC_SUCCESS && req->cmd->common.opcode == nvme_admin_identify) { switch (req->cmd->identify.cns) { -- cgit v1.2.3 From 567da14d46aa726ee05749278355517489e01331 Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Mon, 12 Dec 2022 21:40:35 +0200 Subject: nvme: add nvme_opcode_str function for all nvme cmd types nvme_opcode_str will handle io/admin/fabrics ops This improves NVMe errors logging Signed-off-by: Amit Engel Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/constants.c | 16 ++++++++++++++++ drivers/nvme/host/nvme.h | 13 +++++++++++++ 2 files changed, 29 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c index e958d5015585..bc523ca02254 100644 --- a/drivers/nvme/host/constants.c +++ b/drivers/nvme/host/constants.c @@ -54,6 +54,14 @@ static const char * const nvme_admin_ops[] = { [nvme_admin_get_lba_status] = "Get LBA Status", }; +static const char * const nvme_fabrics_ops[] = { + [nvme_fabrics_type_property_set] = "Property Set", + [nvme_fabrics_type_property_get] = "Property Get", + [nvme_fabrics_type_connect] = "Connect", + [nvme_fabrics_type_auth_send] = "Authentication Send", + [nvme_fabrics_type_auth_receive] = "Authentication Receive", +}; + static const char * const nvme_statuses[] = { [NVME_SC_SUCCESS] = "Success", [NVME_SC_INVALID_OPCODE] = "Invalid Command Opcode", @@ -185,3 +193,11 @@ const unsigned char *nvme_get_admin_opcode_str(u8 opcode) return nvme_admin_ops[opcode]; return "Unknown"; } +EXPORT_SYMBOL_GPL(nvme_get_admin_opcode_str); + +const unsigned char *nvme_get_fabrics_opcode_str(u8 opcode) { + if (opcode < ARRAY_SIZE(nvme_fabrics_ops) && nvme_fabrics_ops[opcode]) + return nvme_fabrics_ops[opcode]; + return "Unknown"; +} +EXPORT_SYMBOL_GPL(nvme_get_fabrics_opcode_str); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c7c45fdb3b48..bf46f122e9e1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1087,6 +1087,7 @@ static inline bool nvme_multi_css(struct nvme_ctrl *ctrl) const unsigned char *nvme_get_error_status_str(u16 status); const unsigned char *nvme_get_opcode_str(u8 opcode); const unsigned char *nvme_get_admin_opcode_str(u8 opcode); +const unsigned char *nvme_get_fabrics_opcode_str(u8 opcode); #else /* CONFIG_NVME_VERBOSE_ERRORS */ static inline const unsigned char *nvme_get_error_status_str(u16 status) { @@ -1100,6 +1101,18 @@ static inline const unsigned char *nvme_get_admin_opcode_str(u8 opcode) { return "Admin Cmd"; } + +static inline const unsigned char *nvme_get_fabrics_opcode_str(u8 opcode) +{ + return "Fabrics Cmd"; +} #endif /* CONFIG_NVME_VERBOSE_ERRORS */ +static inline const unsigned char *nvme_opcode_str(int qid, u8 opcode, u8 fctype) +{ + if (opcode == nvme_fabrics_command) + return nvme_get_fabrics_opcode_str(fctype); + return qid ? nvme_get_opcode_str(opcode) : + nvme_get_admin_opcode_str(opcode); +} #endif /* _NVME_H */ -- cgit v1.2.3 From 99607843e7ed8ffeffc1010bd8001087db756535 Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Mon, 12 Dec 2022 21:40:36 +0200 Subject: nvme-tcp: add additional info for nvme_tcp_timeout log This provides additional details about the rq/cmd that is timed out example log if CONFIG_NVME_VERBOSE_ERRORS is configured: "nvme nvme0: queue 2 timeout cid 0xd058 type 4 opc Write (0x1)" example log if CONFIG_NVME_VERBOSE_ERRORS is not configured: "nvme nvme0: queue 2 timeout cid 0xd058 type 4 opc I/O Cmd (0x1)" Signed-off-by: Amit Engel Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 8cedc1ef496c..d6100a787d39 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2282,10 +2282,13 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq) struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; struct nvme_tcp_cmd_pdu *pdu = req->pdu; + u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype; + int qid = nvme_tcp_queue_id(req->queue); dev_warn(ctrl->device, - "queue %d: timeout request %#x type %d\n", - nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type); + "queue %d: timeout cid %#x type %d opcode %#x (%s)\n", + nvme_tcp_queue_id(req->queue), nvme_cid(rq), pdu->hdr.type, + opc, nvme_opcode_str(qid, opc, fctype)); if (ctrl->state != NVME_CTRL_LIVE) { /* -- cgit v1.2.3 From ddf91717693f1ac524329a88f86ab09e66b18cbe Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Wed, 4 Jan 2023 10:44:32 +0200 Subject: nvmet: for nvme admin set_features cmd, call nvmet_check_data_len_lte() This is due to the fact that the host is allowed to pass the controller an sgl describing a buffer that is larger than the payload itself Signed-off-by: Amit Engel Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 6a54ed6fb121..80099df37314 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -840,7 +840,7 @@ void nvmet_execute_set_features(struct nvmet_req *req) u16 nsqr; u16 ncqr; - if (!nvmet_check_transfer_len(req, 0)) + if (!nvmet_check_data_len_lte(req, 0)) return; switch (cdw10 & 0xff) { -- cgit v1.2.3 From cc115cbe12d932b2f081038bf32c815add2b20d7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 27 Jan 2023 08:56:19 -0800 Subject: nvme: always initialize known command effects Instead of appending command effects flags per IO, set the known effects flags the driver needs to react to just once during initial setup. Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 84 ++++++++++++++++++++++++++---------------------- 1 file changed, 45 insertions(+), 39 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 564ce60bad14..df929ba9bcc2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1061,41 +1061,12 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); -static u32 nvme_known_admin_effects(u8 opcode) -{ - switch (opcode) { - case nvme_admin_format_nvm: - return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_NCC | - NVME_CMD_EFFECTS_CSE_MASK; - case nvme_admin_sanitize_nvm: - return NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK; - default: - break; - } - return 0; -} - -static u32 nvme_known_nvm_effects(u8 opcode) -{ - switch (opcode) { - case nvme_cmd_write: - case nvme_cmd_write_zeroes: - case nvme_cmd_write_uncor: - return NVME_CMD_EFFECTS_LBCC; - default: - return 0; - } -} - u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) { u32 effects = 0; if (ns) { - if (ns->head->effects) - effects = le32_to_cpu(ns->head->effects->iocs[opcode]); - if (ns->head->ids.csi == NVME_CSI_NVM) - effects |= nvme_known_nvm_effects(opcode); + effects = le32_to_cpu(ns->head->effects->iocs[opcode]); if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) dev_warn_once(ctrl->device, "IO command:%02x has unusual effects:%08x\n", @@ -1108,9 +1079,7 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) */ effects &= ~NVME_CMD_EFFECTS_CSE_MASK; } else { - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->acs[opcode]); - effects |= nvme_known_admin_effects(opcode); + effects = le32_to_cpu(ctrl->effects->acs[opcode]); } return effects; @@ -3112,6 +3081,45 @@ free_data: return ret; } +static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl) +{ + struct nvme_effects_log *log = ctrl->effects; + + log->acs[nvme_admin_format_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_NCC | + NVME_CMD_EFFECTS_CSE_MASK); + log->acs[nvme_admin_sanitize_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_CSE_MASK); + + log->iocs[nvme_cmd_write] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC); + log->iocs[nvme_cmd_write_zeroes] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC); + log->iocs[nvme_cmd_write_uncor] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC); +} + +static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + int ret = 0; + + if (ctrl->effects) + return 0; + + if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { + ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); + if (ret < 0) + return ret; + } + + if (!ctrl->effects) { + ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); + if (!ctrl->effects) + return -ENOMEM; + xa_store(&ctrl->cels, NVME_CSI_NVM, ctrl->effects, GFP_KERNEL); + } + + nvme_init_known_nvm_effects(ctrl); + return 0; +} + static int nvme_init_identify(struct nvme_ctrl *ctrl) { struct nvme_id_ctrl *id; @@ -3125,12 +3133,6 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } - if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { - ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); - if (ret < 0) - goto out_free; - } - if (!(ctrl->ops->flags & NVME_F_FABRICS)) ctrl->cntlid = le16_to_cpu(id->cntlid); @@ -3153,6 +3155,10 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) ret = nvme_init_subsystem(ctrl, id); if (ret) goto out_free; + + ret = nvme_init_effects(ctrl, id); + if (ret) + goto out_free; } memcpy(ctrl->subsys->firmware_rev, id->fr, sizeof(ctrl->subsys->firmware_rev)); -- cgit v1.2.3 From baff6491448b487e920faaa117e432989cbafa89 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 27 Jan 2023 08:56:20 -0800 Subject: nvme: mask CSE effects for security receive The nvme driver will freeze the IO queues in response to an admin command with CSE bits set. These bits notify the host that the command that's about to be executed needs to be done exclusively, hence the freeze. The Security Receive command is often reported by multiple vendors with CSE bits set. The reason for this is that the result depends on the previous Security Send. This has nothing to do with IO queues, though, so the driver is taking an overly cautious response to seeing this passthrough command, while unable to fufill the intended admin queue action. Rather than freeze IO during this harmless command, mask off the effects. This freezing is observed to cause IO latency spikes when host software periodically validates the security state of the drives. Signed-off-by: Keith Busch Reviewed-by: Jens Axboe Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index df929ba9bcc2..d1c9402389f9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3091,6 +3091,23 @@ static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl) log->acs[nvme_admin_sanitize_nvm] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK); + /* + * The spec says the result of a security receive command depends on + * the previous security send command. As such, many vendors log this + * command as one to submitted only when no other commands to the same + * namespace are outstanding. The intention is to tell the host to + * prevent mixing security send and receive. + * + * This driver can only enforce such exclusive access against IO + * queues, though. We are not readily able to enforce such a rule for + * two commands to the admin queue, which is the only queue that + * matters for this command. + * + * Rather than blindly freezing the IO queues for this effect that + * doesn't even apply to IO, mask it off. + */ + log->acs[nvme_admin_security_recv] &= ~NVME_CMD_EFFECTS_CSE_MASK; + log->iocs[nvme_cmd_write] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC); log->iocs[nvme_cmd_write_zeroes] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC); log->iocs[nvme_cmd_write_uncor] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC); -- cgit v1.2.3 From e152a05fa054170c05f1d5e04e93e2e75ea11405 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Jan 2023 13:13:47 -0800 Subject: loop: Improve the hw_queue_depth kernel module parameter implementation Make the following minor changes which were reported by colleagues while reviewing this code: - Remove the parentheses from around the LOOP_DEFAULT_HW_Q_DEPTH definition since these are superfluous. - Accept other number formats than decimal, e.g. hexadecimal. - Do not set hw_queue_depth to an out-of-range value, even if that value won't be used. - Use the LOOP_DEFAULT_HW_Q_DEPTH macro in the kernel module parameter description to prevent that the description gets out of sync. This patch has been tested as follows: # modprobe -r loop # modprobe loop hw_queue_depth=-1 modprobe: ERROR: could not insert 'loop': Invalid argument # modprobe loop hw_queue_depth=0 modprobe: ERROR: could not insert 'loop': Invalid argument # modprobe loop hw_queue_depth=1; cat /sys/module/loop/parameters/hw_queue_depth 1 # modprobe -r loop; modprobe loop; cat /sys/module/loop/parameters/hw_queue_depth hw_queue_depth=0x10 16 # modprobe -r loop; modprobe loop; cat /sys/module/loop/parameters/hw_queue_depth hw_queue_depth=128 128 # modprobe -r loop; modprobe loop hw_queue_depth=129; cat /sys/module/loop/parameters/hw_queue_depth 129 # modprobe -r loop; modprobe loop hw_queue_depth=$((1<<32)) modprobe: ERROR: could not insert 'loop': Numerical result out of range See also commit ef44c50837ab ("loop: allow user to set the queue depth"). Cc: Chaitanya Kulkarni Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20230130211347.832110-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- drivers/block/loop.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 1518a6423279..5f04235e4ff7 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -90,7 +90,7 @@ struct loop_cmd { }; #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) -#define LOOP_DEFAULT_HW_Q_DEPTH (128) +#define LOOP_DEFAULT_HW_Q_DEPTH 128 static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); @@ -1792,9 +1792,15 @@ static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH; static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p) { - int ret = kstrtoint(s, 10, &hw_queue_depth); + int qd, ret; - return (ret || (hw_queue_depth < 1)) ? -EINVAL : 0; + ret = kstrtoint(s, 0, &qd); + if (ret < 0) + return ret; + if (qd < 1) + return -EINVAL; + hw_queue_depth = qd; + return 0; } static const struct kernel_param_ops loop_hw_qdepth_param_ops = { @@ -1803,7 +1809,7 @@ static const struct kernel_param_ops loop_hw_qdepth_param_ops = { }; device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); -MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 128"); +MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: " __stringify(LOOP_DEFAULT_HW_Q_DEPTH)); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); -- cgit v1.2.3 From 1d1f25bfda432a6b61bd0205d426226bbbd73504 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 31 Jan 2023 15:07:19 +0800 Subject: md: don't update recovery_cp when curr_resync is ACTIVE Don't update recovery_cp when curr_resync is MD_RESYNC_ACTIVE, otherwise md may skip the resync of the first 3 sectors if the resync procedure is interrupted before the first calling of ->sync_request() as shown below: md_do_sync thread control thread // setup resync mddev->recovery_cp = 0 j = 0 mddev->curr_resync = MD_RESYNC_ACTIVE // e.g., set array as idle set_bit(MD_RECOVERY_INTR, &&mddev_recovery) // resync loop // check INTR before calling sync_request !test_bit(MD_RECOVERY_INTR, &mddev->recovery // resync interrupted // update recovery_cp from 0 to 3 // the resync of three 3 sectors will be skipped mddev->recovery_cp = 3 Fixes: eac58d08d493 ("md: Use enum for overloaded magic numbers used by mddev->curr_resync") Cc: stable@vger.kernel.org # 6.0+ Signed-off-by: Hou Tao Reviewed-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 02b0240e7c71..272cc5d14906 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9030,7 +9030,7 @@ void md_do_sync(struct md_thread *thread) mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && - mddev->curr_resync >= MD_RESYNC_ACTIVE) { + mddev->curr_resync > MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->recovery_cp) { -- cgit v1.2.3 From d19329133d25ad3dc32f8a62635692cb2f189014 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Tue, 31 Jan 2023 13:17:09 +0800 Subject: md: Factor out is_md_suspended helper This helper function will be used in next patch. It's easy for understanding. Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/md.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 272cc5d14906..5ec9fdd5e668 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -380,6 +380,13 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); +static bool is_md_suspended(struct mddev *mddev) +{ + if (mddev->suspended) + return true; + else + return false; +} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. @@ -389,7 +396,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock); */ static bool is_suspended(struct mddev *mddev, struct bio *bio) { - if (mddev->suspended) + if (is_md_suspended(mddev)) return true; if (bio_data_dir(bio) != WRITE) return false; @@ -434,7 +441,7 @@ check_suspended: goto check_suspended; } - if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + if (atomic_dec_and_test(&mddev->active_io) && is_md_suspended(mddev)) wake_up(&mddev->sb_wait); } EXPORT_SYMBOL(md_handle_request); @@ -6219,7 +6226,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { md_bitmap_wait_behind_writes(mddev); - if (mddev->pers && mddev->pers->quiesce && !mddev->suspended) { + if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } @@ -8531,7 +8538,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) return true; wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || - mddev->suspended); + is_md_suspended(mddev)); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { percpu_ref_put(&mddev->writes_pending); return false; @@ -9259,7 +9266,7 @@ void md_check_recovery(struct mddev *mddev) wake_up(&mddev->sb_wait); } - if (mddev->suspended) + if (is_md_suspended(mddev)) return; if (mddev->bitmap) -- cgit v1.2.3 From 72adae23a72cb12e2ef0dcd7c0aa042867f27998 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Tue, 31 Jan 2023 13:17:10 +0800 Subject: md: Change active_io to percpu Now the type of active_io is atomic. It's used to count how many ios are in the submitting process and it's added and decreased very time. But it only needs to check if it's zero when suspending the raid. So we can switch atomic to percpu to improve the performance. After switching active_io to percpu type, we use the state of active_io to judge if the raid device is suspended. And we don't need to wake up ->sb_wait in md_handle_request anymore. It's done in the callback function which is registered when initing active_io. The argument mddev->suspended is only used to count how many users are trying to set raid to suspend state. Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/md.c | 43 ++++++++++++++++++++++++------------------- drivers/md/md.h | 2 +- 2 files changed, 25 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 5ec9fdd5e668..da6370835c47 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -382,10 +382,7 @@ static DEFINE_SPINLOCK(all_mddevs_lock); static bool is_md_suspended(struct mddev *mddev) { - if (mddev->suspended) - return true; - else - return false; + return percpu_ref_is_dying(&mddev->active_io); } /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is @@ -412,12 +409,10 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio) void md_handle_request(struct mddev *mddev, struct bio *bio) { check_suspended: - rcu_read_lock(); if (is_suspended(mddev, bio)) { DEFINE_WAIT(__wait); /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) { - rcu_read_unlock(); bio_wouldblock_error(bio); return; } @@ -426,23 +421,19 @@ check_suspended: TASK_UNINTERRUPTIBLE); if (!is_suspended(mddev, bio)) break; - rcu_read_unlock(); schedule(); - rcu_read_lock(); } finish_wait(&mddev->sb_wait, &__wait); } - atomic_inc(&mddev->active_io); - rcu_read_unlock(); + if (!percpu_ref_tryget_live(&mddev->active_io)) + goto check_suspended; if (!mddev->pers->make_request(mddev, bio)) { - atomic_dec(&mddev->active_io); - wake_up(&mddev->sb_wait); + percpu_ref_put(&mddev->active_io); goto check_suspended; } - if (atomic_dec_and_test(&mddev->active_io) && is_md_suspended(mddev)) - wake_up(&mddev->sb_wait); + percpu_ref_put(&mddev->active_io); } EXPORT_SYMBOL(md_handle_request); @@ -490,11 +481,10 @@ void mddev_suspend(struct mddev *mddev) lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->suspended++) return; - synchronize_rcu(); wake_up(&mddev->sb_wait); set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); - smp_mb__after_atomic(); - wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); + percpu_ref_kill(&mddev->active_io); + wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); mddev->pers->quiesce(mddev, 1); clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); @@ -512,6 +502,7 @@ void mddev_resume(struct mddev *mddev) lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; + percpu_ref_resurrect(&mddev->active_io); wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 0); @@ -690,7 +681,6 @@ void mddev_init(struct mddev *mddev) timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); - atomic_set(&mddev->active_io, 0); spin_lock_init(&mddev->lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); @@ -5767,6 +5757,12 @@ static void md_safemode_timeout(struct timer_list *t) } static int start_dirty_degraded; +static void active_io_release(struct percpu_ref *ref) +{ + struct mddev *mddev = container_of(ref, struct mddev, active_io); + + wake_up(&mddev->sb_wait); +} int md_run(struct mddev *mddev) { @@ -5847,10 +5843,15 @@ int md_run(struct mddev *mddev) nowait = nowait && bdev_nowait(rdev->bdev); } + err = percpu_ref_init(&mddev->active_io, active_io_release, + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL); + if (err) + return err; + if (!bioset_initialized(&mddev->bio_set)) { err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) - return err; + goto exit_active_io; } if (!bioset_initialized(&mddev->sync_set)) { err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); @@ -6038,6 +6039,8 @@ abort: bioset_exit(&mddev->sync_set); exit_bio_set: bioset_exit(&mddev->bio_set); +exit_active_io: + percpu_ref_exit(&mddev->active_io); return err; } EXPORT_SYMBOL_GPL(md_run); @@ -6262,6 +6265,7 @@ void md_stop(struct mddev *mddev) */ __md_stop_writes(mddev); __md_stop(mddev); + percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); } @@ -7835,6 +7839,7 @@ static void md_free_disk(struct gendisk *disk) struct mddev *mddev = disk->private_data; percpu_ref_exit(&mddev->writes_pending); + percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); diff --git a/drivers/md/md.h b/drivers/md/md.h index 554a9026669a..6335cb86e52e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -315,7 +315,7 @@ struct mddev { unsigned long sb_flags; int suspended; - atomic_t active_io; + struct percpu_ref active_io; int ro; int sysfs_active; /* set when sysfs deletes * are happening, so run/ -- cgit v1.2.3 From 07dbb13542cc022677b64acc6e0bd0d8a2cbf4dc Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Sat, 21 Jan 2023 09:48:10 +0800 Subject: md: Free writes_pending in md_stop dm raid calls md_stop to stop the raid device. It needs to free the writes_pending here. Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index da6370835c47..0cf340243ddb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6265,6 +6265,7 @@ void md_stop(struct mddev *mddev) */ __md_stop_writes(mddev); __md_stop(mddev); + percpu_ref_exit(&mddev->writes_pending); percpu_ref_exit(&mddev->active_io); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); -- cgit v1.2.3 From ed821cf84e7b969fb5b63598c89d3428a30d8d31 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 1 Feb 2023 15:59:20 +0800 Subject: md: use MD_RESYNC_* whenever possible Just replace magic numbers by MD_RESYNC_* enumerations. Signed-off-by: Hou Tao Reviewed-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/md.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 0cf340243ddb..1961105712b7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6166,7 +6166,7 @@ static void md_clean(struct mddev *mddev) mddev->new_level = LEVEL_NONE; mddev->new_layout = 0; mddev->new_chunk_sectors = 0; - mddev->curr_resync = 0; + mddev->curr_resync = MD_RESYNC_NONE; atomic64_set(&mddev->resync_mismatches, 0); mddev->suspend_lo = mddev->suspend_hi = 0; mddev->sync_speed_min = mddev->sync_speed_max = 0; @@ -8896,7 +8896,7 @@ void md_do_sync(struct md_thread *thread) atomic_set(&mddev->recovery_active, 0); last_check = 0; - if (j>2) { + if (j >= MD_RESYNC_ACTIVE) { pr_debug("md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; @@ -8968,7 +8968,7 @@ void md_do_sync(struct md_thread *thread) if (j > max_sectors) /* when skipping, extra large numbers can be returned. */ j = max_sectors; - if (j > 2) + if (j >= MD_RESYNC_ACTIVE) mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) -- cgit v1.2.3 From f1e117cbb01a38f764db2f292174b93eab7c2db2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Feb 2023 16:06:15 +0100 Subject: sd: factor out a sd_set_special_bvec helper Add a helper for setting up the special_bvec instead of open coding it in three place, and use the new bvec_set_page helper to initialize special_vec. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20230203150634.3199647-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 47dafe6b8a66..277960decc10 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -831,6 +831,19 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) blk_queue_max_discard_sectors(q, max_blocks * (logical_block_size >> 9)); } +static void *sd_set_special_bvec(struct request *rq, unsigned int data_len) +{ + struct page *page; + + page = mempool_alloc(sd_page_pool, GFP_ATOMIC); + if (!page) + return NULL; + clear_highpage(page); + bvec_set_page(&rq->special_vec, page, data_len, 0); + rq->rq_flags |= RQF_SPECIAL_PAYLOAD; + return bvec_virt(&rq->special_vec); +} + static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) { struct scsi_device *sdp = cmd->device; @@ -841,19 +854,14 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) unsigned int data_len = 24; char *buf; - rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); - if (!rq->special_vec.bv_page) + buf = sd_set_special_bvec(rq, data_len); + if (!buf) return BLK_STS_RESOURCE; - clear_highpage(rq->special_vec.bv_page); - rq->special_vec.bv_offset = 0; - rq->special_vec.bv_len = data_len; - rq->rq_flags |= RQF_SPECIAL_PAYLOAD; cmd->cmd_len = 10; cmd->cmnd[0] = UNMAP; cmd->cmnd[8] = 24; - buf = bvec_virt(&rq->special_vec); put_unaligned_be16(6 + 16, &buf[0]); put_unaligned_be16(16, &buf[2]); put_unaligned_be64(lba, &buf[8]); @@ -876,13 +884,8 @@ static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; - rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); - if (!rq->special_vec.bv_page) + if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; - clear_highpage(rq->special_vec.bv_page); - rq->special_vec.bv_offset = 0; - rq->special_vec.bv_len = data_len; - rq->rq_flags |= RQF_SPECIAL_PAYLOAD; cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_SAME_16; @@ -908,13 +911,8 @@ static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; - rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); - if (!rq->special_vec.bv_page) + if (!sd_set_special_bvec(rq, data_len)) return BLK_STS_RESOURCE; - clear_highpage(rq->special_vec.bv_page); - rq->special_vec.bv_offset = 0; - rq->special_vec.bv_len = data_len; - rq->rq_flags |= RQF_SPECIAL_PAYLOAD; cmd->cmd_len = 10; cmd->cmnd[0] = WRITE_SAME; -- cgit v1.2.3 From 3c7ebe952fefb646c56b60f1c3e3388f3b938cc7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Feb 2023 16:06:16 +0100 Subject: target: use bvec_set_page to initialize bvecs Use the bvec_set_page helper to initialize bvecs. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20230203150634.3199647-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/target/target_core_file.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index fd584111da45..ce0e000b74fc 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -281,10 +281,8 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; for_each_sg(sgl, sg, sgl_nents, i) { - aio_cmd->bvecs[i].bv_page = sg_page(sg); - aio_cmd->bvecs[i].bv_len = sg->length; - aio_cmd->bvecs[i].bv_offset = sg->offset; - + bvec_set_page(&aio_cmd->bvecs[i], sg_page(sg), sg->length, + sg->offset); len += sg->length; } @@ -329,10 +327,7 @@ static int fd_do_rw(struct se_cmd *cmd, struct file *fd, } for_each_sg(sgl, sg, sgl_nents, i) { - bvec[i].bv_page = sg_page(sg); - bvec[i].bv_len = sg->length; - bvec[i].bv_offset = sg->offset; - + bvec_set_page(&bvec[i], sg_page(sg), sg->length, sg->offset); len += sg->length; } @@ -465,10 +460,9 @@ fd_execute_write_same(struct se_cmd *cmd) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; for (i = 0; i < nolb; i++) { - bvec[i].bv_page = sg_page(&cmd->t_data_sg[0]); - bvec[i].bv_len = cmd->t_data_sg[0].length; - bvec[i].bv_offset = cmd->t_data_sg[0].offset; - + bvec_set_page(&bvec[i], sg_page(&cmd->t_data_sg[0]), + cmd->t_data_sg[0].length, + cmd->t_data_sg[0].offset); len += se_dev->dev_attrib.block_size; } -- cgit v1.2.3 From fc41c97a3a7b08131e6998bc7692f95729f9d359 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Feb 2023 16:06:17 +0100 Subject: nvmet: use bvec_set_page to initialize bvecs Use the bvec_set_page helper to initialize bvecs. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230203150634.3199647-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd-file.c | 10 ++-------- drivers/nvme/target/tcp.c | 5 ++--- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 871c4f32f443..2d068439b129 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -73,13 +73,6 @@ err: return ret; } -static void nvmet_file_init_bvec(struct bio_vec *bv, struct scatterlist *sg) -{ - bv->bv_page = sg_page(sg); - bv->bv_offset = sg->offset; - bv->bv_len = sg->length; -} - static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, unsigned long nr_segs, size_t count, int ki_flags) { @@ -146,7 +139,8 @@ static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags) memset(&req->f.iocb, 0, sizeof(struct kiocb)); for_each_sg(req->sg, sg, req->sg_cnt, i) { - nvmet_file_init_bvec(&req->f.bvec[bv_cnt], sg); + bvec_set_page(&req->f.bvec[bv_cnt], sg_page(sg), sg->length, + sg->offset); len += req->f.bvec[bv_cnt].bv_len; total_len += req->f.bvec[bv_cnt].bv_len; bv_cnt++; diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index cc05c094de22..c5759eb503d0 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -321,9 +321,8 @@ static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) while (length) { u32 iov_len = min_t(u32, length, sg->length - sg_offset); - iov->bv_page = sg_page(sg); - iov->bv_len = sg->length; - iov->bv_offset = sg->offset + sg_offset; + bvec_set_page(iov, sg_page(sg), sg->length, + sg->offset + sg_offset); length -= iov_len; sg = sg_next(sg); -- cgit v1.2.3 From 4bee16daf13225d6b109bb95d613fd691b04a757 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Feb 2023 16:06:18 +0100 Subject: nvme: use bvec_set_virt to initialize special_vec Use the bvec_set_virt helper to initialize the special_vec. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230203150634.3199647-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 505e16f20e57..7ba1accc3c22 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -806,9 +806,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, cmnd->dsm.nr = cpu_to_le32(segments - 1); cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - req->special_vec.bv_page = virt_to_page(range); - req->special_vec.bv_offset = offset_in_page(range); - req->special_vec.bv_len = alloc_size; + bvec_set_virt(&req->special_vec, range, alloc_size); req->rq_flags |= RQF_SPECIAL_PAYLOAD; return BLK_STS_OK; -- cgit v1.2.3 From 7df2af0bb4912cf360045d065f88fe4ed2f702ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Feb 2023 16:06:19 +0100 Subject: rbd: use bvec_set_page to initialize the copy up bvec Use the bvec_set_page helper to initialize the copy up bvec. Signed-off-by: Christoph Hellwig Reviewed-by: Ilya Dryomov Link: https://lore.kernel.org/r/20230203150634.3199647-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rbd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 04453f4a319c..1faca7e07a4d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3068,13 +3068,12 @@ static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) for (i = 0; i < obj_req->copyup_bvec_count; i++) { unsigned int len = min(obj_overlap, (u64)PAGE_SIZE); + struct page *page = alloc_page(GFP_NOIO); - obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO); - if (!obj_req->copyup_bvecs[i].bv_page) + if (!page) return -ENOMEM; - obj_req->copyup_bvecs[i].bv_offset = 0; - obj_req->copyup_bvecs[i].bv_len = len; + bvec_set_page(&obj_req->copyup_bvecs[i], page, len, 0); obj_overlap -= len; } -- cgit v1.2.3 From b831f3a1031664ae2443bab63d35c416ed30c91d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Feb 2023 16:06:20 +0100 Subject: virtio_blk: use bvec_set_virt to initialize special_vec Use the bvec_set_virt helper to initialize the special_vec. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Link: https://lore.kernel.org/r/20230203150634.3199647-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6a77fa917428..dc6e9b989910 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -170,9 +170,7 @@ static int virtblk_setup_discard_write_zeroes_erase(struct request *req, bool un WARN_ON_ONCE(n != segments); - req->special_vec.bv_page = virt_to_page(range); - req->special_vec.bv_offset = offset_in_page(range); - req->special_vec.bv_len = sizeof(*range) * segments; + bvec_set_virt(&req->special_vec, range, sizeof(*range) * segments); req->rq_flags |= RQF_SPECIAL_PAYLOAD; return 0; -- cgit v1.2.3 From 13ae4db0c05107814db4e774856aa83e72e8bf04 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Feb 2023 16:06:21 +0100 Subject: zram: use bvec_set_page to initialize bvecs Use the bvec_set_page helper to initialize bvecs. Signed-off-by: Christoph Hellwig Reviewed-by: Sergey Senozhatsky Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230203150634.3199647-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e290d6d97047..bd8ae4822dc3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -703,9 +703,7 @@ static ssize_t writeback_store(struct device *dev, for (; nr_pages != 0; index++, nr_pages--) { struct bio_vec bvec; - bvec.bv_page = page; - bvec.bv_len = PAGE_SIZE; - bvec.bv_offset = 0; + bvec_set_page(&bvec, page, PAGE_SIZE, 0); spin_lock(&zram->wb_limit_lock); if (zram->wb_limit_enable && !zram->bd_wb_limit) { @@ -1380,12 +1378,9 @@ out: static int zram_bvec_read_from_bdev(struct zram *zram, struct page *page, u32 index, struct bio *bio, bool partial_io) { - struct bio_vec bvec = { - .bv_page = page, - .bv_len = PAGE_SIZE, - .bv_offset = 0, - }; + struct bio_vec bvec; + bvec_set_page(&bvec, page, PAGE_SIZE, 0); return read_from_bdev(zram, &bvec, zram_get_element(zram, index), bio, partial_io); } @@ -1652,9 +1647,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, memcpy_from_bvec(dst + offset, bvec); kunmap_atomic(dst); - vec.bv_page = page; - vec.bv_len = PAGE_SIZE; - vec.bv_offset = 0; + bvec_set_page(&vec, page, PAGE_SIZE, 0); } ret = __zram_bvec_write(zram, &vec, index, bio); -- cgit v1.2.3 From 58dfe14073846e416d5b3595314a4f37e1a89c50 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Feb 2023 16:06:33 +0100 Subject: vringh: use bvec_set_page to initialize a bvec Use the bvec_set_page helper to initialize a bvec. Signed-off-by: Christoph Hellwig Acked-by: Jason Wang Link: https://lore.kernel.org/r/20230203150634.3199647-23-hch@lst.de Signed-off-by: Jens Axboe --- drivers/vhost/vringh.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index 33eb941fcf15..a1e27da54481 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -1126,9 +1126,8 @@ static int iotlb_translate(const struct vringh *vrh, size = map->size - addr + map->start; pa = map->addr + addr - map->start; pfn = pa >> PAGE_SHIFT; - iov[ret].bv_page = pfn_to_page(pfn); - iov[ret].bv_len = min(len - s, size); - iov[ret].bv_offset = pa & (PAGE_SIZE - 1); + bvec_set_page(&iov[ret], pfn_to_page(pfn), min(len - s, size), + pa & (PAGE_SIZE - 1)); s += size; addr += size; ++ret; -- cgit v1.2.3 From 731e208d7b4b38d2bac4b7c53403c8abbf306d01 Mon Sep 17 00:00:00 2001 From: Ziyang Zhang Date: Tue, 7 Feb 2023 15:08:37 +0800 Subject: ublk: remove unnecessary NULL check in ublk_rq_has_data() bio_has_data() allows a NULL bio so the NULL check in ublk_rq_has_data() is unnecessary. Signed-off-by: Ziyang Zhang Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230207070839.370817-2-ZiyangZhang@linux.alibaba.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c932e9ea5a0f..55fccce68a9c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -322,7 +322,7 @@ static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev, static inline bool ublk_rq_has_data(const struct request *rq) { - return rq->bio && bio_has_data(rq->bio); + return bio_has_data(rq->bio); } static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, -- cgit v1.2.3 From b352389e7ba34bdb5bcf4254fa1e85319ba76352 Mon Sep 17 00:00:00 2001 From: Ziyang Zhang Date: Tue, 7 Feb 2023 15:08:38 +0800 Subject: ublk: mention WRITE_ZEROES in comment of ublk_complete_rq() WRITE_ZEROES won't return bytes returned just like FLUSH and DISCARD, and we can end it directly. Add missing comment for it in ublk_complete_rq(). Signed-off-by: Ziyang Zhang Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230207070839.370817-3-ZiyangZhang@linux.alibaba.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 55fccce68a9c..06eddefdf02a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -665,7 +665,7 @@ static void ublk_complete_rq(struct request *req) } /* - * FLUSH or DISCARD usually won't return bytes returned, so end them + * FLUSH, DISCARD or WRITE_ZEROES usually won't return bytes returned, so end them * directly. * * Both the two needn't unmap. -- cgit v1.2.3 From 1972d038a5401781377d3ce2d901bf7763a43589 Mon Sep 17 00:00:00 2001 From: Ziyang Zhang Date: Tue, 7 Feb 2023 15:08:39 +0800 Subject: ublk: pass NULL to blk_mq_alloc_disk() as queuedata queuedata is not referenced in ublk_drv and we can use driver_data instead. Pass NULL to blk_mq_alloc_disk() as queuedata while allocating ublk's gendisk. Signed-off-by: Ziyang Zhang Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230207070839.370817-4-ZiyangZhang@linux.alibaba.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 06eddefdf02a..d83fe2c2b3ba 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1578,7 +1578,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) goto out_unlock; } - disk = blk_mq_alloc_disk(&ub->tag_set, ub); + disk = blk_mq_alloc_disk(&ub->tag_set, NULL); if (IS_ERR(disk)) { ret = PTR_ERR(disk); goto out_unlock; -- cgit v1.2.3 From 0abe39dec065133e3f92a52219c3728fe7d7617f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 7 Feb 2023 23:07:00 +0800 Subject: block: ublk: improve handling device deletion Inside ublk_ctrl_del_dev(), when the device is removed, we wait until the device number is freed with holding global lock of ublk_ctl_mutex, this way isn't friendly from user viewpoint: 1) if device is in-use, the current delete command hangs in ublk_ctrl_del_dev(), and user can't break from the handling because wait_event() is used 2) global lock is held, so any new device can't be added and other old devices can't be removed. Improve the deleting handling by the following way, suggested by Nadav: 1) wait without holding the global lock 2) replace wait_event() with wait_event_interruptible() Reported-by: Nadav Amit Suggested-by: Nadav Amit Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230207150700.545530-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d83fe2c2b3ba..e6eceee44366 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -150,6 +150,7 @@ struct ublk_device { #define UB_STATE_OPEN 0 #define UB_STATE_USED 1 +#define UB_STATE_DELETED 2 unsigned long state; int ub_number; @@ -1804,20 +1805,33 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub) if (ret) return ret; - ublk_remove(ub); + if (!test_bit(UB_STATE_DELETED, &ub->state)) { + ublk_remove(ub); + set_bit(UB_STATE_DELETED, &ub->state); + } /* Mark the reference as consumed */ *p_ub = NULL; ublk_put_device(ub); + mutex_unlock(&ublk_ctl_mutex); /* * Wait until the idr is removed, then it can be reused after * DEL_DEV command is returned. + * + * If we returns because of user interrupt, future delete command + * may come: + * + * - the device number isn't freed, this device won't or needn't + * be deleted again, since UB_STATE_DELETED is set, and device + * will be released after the last reference is dropped + * + * - the device number is freed already, we will not find this + * device via ublk_get_device_from_id() */ - wait_event(ublk_idr_wq, ublk_idr_freed(idx)); - mutex_unlock(&ublk_ctl_mutex); + wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)); - return ret; + return 0; } static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) -- cgit v1.2.3 From 76fed01420bb8b0e282745a4945925b25751d42b Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Fri, 3 Feb 2023 13:13:44 +0800 Subject: md: account io_acct_set usage with active_io io_acct_set was enabled for raid0/raid5 io accounting. bios that contain md_io_acct are allocated in the i/o path. There isn't a good method to monitor if these bios are all finished and freed. In the takeover process, io_acct_set (which is used for bios with md_io_acct) need to be freed. However, if some bios finish after io_acct_set is freed, it may trigger the following panic: [ 6973.767999] RIP: 0010:mempool_free+0x52/0x80 [ 6973.786098] Call Trace: [ 6973.786549] md_end_io_acct+0x31/0x40 [ 6973.787227] blk_update_request+0x224/0x380 [ 6973.787994] blk_mq_end_request+0x1a/0x130 [ 6973.788739] blk_complete_reqs+0x35/0x50 [ 6973.789456] __do_softirq+0xd7/0x2c8 [ 6973.790114] ? sort_range+0x20/0x20 [ 6973.790763] run_ksoftirqd+0x2a/0x40 [ 6973.791400] smpboot_thread_fn+0xb5/0x150 [ 6973.792114] kthread+0x10b/0x130 [ 6973.792724] ? set_kthread_struct+0x50/0x50 [ 6973.793491] ret_from_fork+0x1f/0x40 Fix this by increasing and decreasing active_io for each bio with md_io_acct so that mddev_suspend() will wait until all bios from io_acct_set finish before freeing io_acct_set. Reported-by: Fine Fan Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/md.c | 6 ++++++ drivers/md/md.h | 7 ++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 1961105712b7..927a43db5dfb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8628,12 +8628,15 @@ static void md_end_io_acct(struct bio *bio) { struct md_io_acct *md_io_acct = bio->bi_private; struct bio *orig_bio = md_io_acct->orig_bio; + struct mddev *mddev = md_io_acct->mddev; orig_bio->bi_status = bio->bi_status; bio_end_io_acct(orig_bio, md_io_acct->start_time); bio_put(bio); bio_endio(orig_bio); + + percpu_ref_put(&mddev->active_io); } /* @@ -8649,10 +8652,13 @@ void md_account_bio(struct mddev *mddev, struct bio **bio) if (!blk_queue_io_stat(bdev->bd_disk->queue)) return; + percpu_ref_get(&mddev->active_io); + clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set); md_io_acct = container_of(clone, struct md_io_acct, bio_clone); md_io_acct->orig_bio = *bio; md_io_acct->start_time = bio_start_io_acct(*bio); + md_io_acct->mddev = mddev; clone->bi_end_io = md_end_io_acct; clone->bi_private = md_io_acct; diff --git a/drivers/md/md.h b/drivers/md/md.h index 6335cb86e52e..e148e3c83b0d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -710,9 +710,10 @@ struct md_thread { }; struct md_io_acct { - struct bio *orig_bio; - unsigned long start_time; - struct bio bio_clone; + struct mddev *mddev; + struct bio *orig_bio; + unsigned long start_time; + struct bio bio_clone; }; #define THREAD_WAKEUP 0 -- cgit v1.2.3 From b87c52e431adfe2dfe8634216b317b4a952aa9fc Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 10 Feb 2023 01:02:52 +0100 Subject: s390/dasd: sort out physical vs virtual pointers usage This does not fix a real bug, since virtual addresses are currently indentical to physical ones. Signed-off-by: Alexander Gordeev Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20230210000253.1644903-2-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd.c | 5 +- drivers/s390/block/dasd_3990_erp.c | 10 ++-- drivers/s390/block/dasd_alias.c | 6 +-- drivers/s390/block/dasd_eckd.c | 100 ++++++++++++++++++------------------- drivers/s390/block/dasd_eer.c | 2 +- drivers/s390/block/dasd_fba.c | 14 +++--- 6 files changed, 67 insertions(+), 70 deletions(-) (limited to 'drivers') diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 5a6d9c15395f..a9c2a8d76c45 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3978,7 +3978,7 @@ static struct dasd_ccw_req *dasd_generic_build_rdc(struct dasd_device *device, ccw = cqr->cpaddr; ccw->cmd_code = CCW_CMD_RDC; - ccw->cda = (__u32)(addr_t) cqr->data; + ccw->cda = (__u32)virt_to_phys(cqr->data); ccw->flags = 0; ccw->count = rdc_buffer_size; cqr->startdev = device; @@ -4022,8 +4022,7 @@ char *dasd_get_sense(struct irb *irb) if (scsw_is_tm(&irb->scsw) && (irb->scsw.tm.fcxs == 0x01)) { if (irb->scsw.tm.tcw) - tsb = tcw_get_tsb((struct tcw *)(unsigned long) - irb->scsw.tm.tcw); + tsb = tcw_get_tsb(phys_to_virt(irb->scsw.tm.tcw)); if (tsb && tsb->length == 64 && tsb->flags) switch (tsb->flags & 0x07) { case 1: /* tsa_iostat */ diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c index 81d283b3cd3b..9fd36c468706 100644 --- a/drivers/s390/block/dasd_3990_erp.c +++ b/drivers/s390/block/dasd_3990_erp.c @@ -220,7 +220,7 @@ dasd_3990_erp_DCTL(struct dasd_ccw_req * erp, char modifier) memset(ccw, 0, sizeof(struct ccw1)); ccw->cmd_code = CCW_CMD_DCTL; ccw->count = 4; - ccw->cda = (__u32)(addr_t) DCTL_data; + ccw->cda = (__u32)virt_to_phys(DCTL_data); dctl_cqr->flags = erp->flags; dctl_cqr->function = dasd_3990_erp_DCTL; dctl_cqr->refers = erp; @@ -1714,7 +1714,7 @@ dasd_3990_erp_action_1B_32(struct dasd_ccw_req * default_erp, char *sense) ccw->cmd_code = DASD_ECKD_CCW_DEFINE_EXTENT; ccw->flags = CCW_FLAG_CC; ccw->count = 16; - ccw->cda = (__u32)(addr_t) DE_data; + ccw->cda = (__u32)virt_to_phys(DE_data); /* create LO ccw */ ccw++; @@ -1722,7 +1722,7 @@ dasd_3990_erp_action_1B_32(struct dasd_ccw_req * default_erp, char *sense) ccw->cmd_code = DASD_ECKD_CCW_LOCATE_RECORD; ccw->flags = CCW_FLAG_CC; ccw->count = 16; - ccw->cda = (__u32)(addr_t) LO_data; + ccw->cda = (__u32)virt_to_phys(LO_data); /* TIC to the failed ccw */ ccw++; @@ -2419,7 +2419,7 @@ static struct dasd_ccw_req *dasd_3990_erp_add_erp(struct dasd_ccw_req *cqr) tcw = erp->cpaddr; tsb = (struct tsb *) &tcw[1]; *tcw = *((struct tcw *)cqr->cpaddr); - tcw->tsb = (long)tsb; + tcw->tsb = virt_to_phys(tsb); } else if (ccw->cmd_code == DASD_ECKD_CCW_PSF) { /* PSF cannot be chained from NOOP/TIC */ erp->cpaddr = cqr->cpaddr; @@ -2430,7 +2430,7 @@ static struct dasd_ccw_req *dasd_3990_erp_add_erp(struct dasd_ccw_req *cqr) ccw->flags = CCW_FLAG_CC; ccw++; ccw->cmd_code = CCW_CMD_TIC; - ccw->cda = (long)(cqr->cpaddr); + ccw->cda = (__u32)virt_to_phys(cqr->cpaddr); } erp->flags = cqr->flags; diff --git a/drivers/s390/block/dasd_alias.c b/drivers/s390/block/dasd_alias.c index b6b938aa6615..c9740ae88d1a 100644 --- a/drivers/s390/block/dasd_alias.c +++ b/drivers/s390/block/dasd_alias.c @@ -443,7 +443,7 @@ static int read_unit_address_configuration(struct dasd_device *device, ccw->cmd_code = DASD_ECKD_CCW_PSF; ccw->count = sizeof(struct dasd_psf_prssd_data); ccw->flags |= CCW_FLAG_CC; - ccw->cda = (__u32)(addr_t) prssdp; + ccw->cda = (__u32)virt_to_phys(prssdp); /* Read Subsystem Data - feature codes */ memset(lcu->uac, 0, sizeof(*(lcu->uac))); @@ -451,7 +451,7 @@ static int read_unit_address_configuration(struct dasd_device *device, ccw++; ccw->cmd_code = DASD_ECKD_CCW_RSSD; ccw->count = sizeof(*(lcu->uac)); - ccw->cda = (__u32)(addr_t) lcu->uac; + ccw->cda = (__u32)virt_to_phys(lcu->uac); cqr->buildclk = get_tod_clock(); cqr->status = DASD_CQR_FILLED; @@ -747,7 +747,7 @@ static int reset_summary_unit_check(struct alias_lcu *lcu, ccw->cmd_code = DASD_ECKD_CCW_RSCK; ccw->flags = CCW_FLAG_SLI; ccw->count = 16; - ccw->cda = (__u32)(addr_t) cqr->data; + ccw->cda = (__u32)virt_to_phys(cqr->data); ((char *)cqr->data)[0] = reason; clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 5d0b9991e91a..03ecd8846378 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -288,7 +288,7 @@ define_extent(struct ccw1 *ccw, struct DE_eckd_data *data, unsigned int trk, ccw->cmd_code = DASD_ECKD_CCW_DEFINE_EXTENT; ccw->flags = 0; ccw->count = 16; - ccw->cda = (__u32)__pa(data); + ccw->cda = (__u32)virt_to_phys(data); } memset(data, 0, sizeof(struct DE_eckd_data)); @@ -398,7 +398,7 @@ static void locate_record_ext(struct ccw1 *ccw, struct LRE_eckd_data *data, ccw->count = 22; else ccw->count = 20; - ccw->cda = (__u32)__pa(data); + ccw->cda = (__u32)virt_to_phys(data); } memset(data, 0, sizeof(*data)); @@ -544,11 +544,11 @@ static int prefix_LRE(struct ccw1 *ccw, struct PFX_eckd_data *pfxdata, ccw->flags = 0; if (cmd == DASD_ECKD_CCW_WRITE_FULL_TRACK) { ccw->count = sizeof(*pfxdata) + 2; - ccw->cda = (__u32) __pa(pfxdata); + ccw->cda = (__u32)virt_to_phys(pfxdata); memset(pfxdata, 0, sizeof(*pfxdata) + 2); } else { ccw->count = sizeof(*pfxdata); - ccw->cda = (__u32) __pa(pfxdata); + ccw->cda = (__u32)virt_to_phys(pfxdata); memset(pfxdata, 0, sizeof(*pfxdata)); } @@ -615,7 +615,7 @@ locate_record(struct ccw1 *ccw, struct LO_eckd_data *data, unsigned int trk, ccw->cmd_code = DASD_ECKD_CCW_LOCATE_RECORD; ccw->flags = 0; ccw->count = 16; - ccw->cda = (__u32) __pa(data); + ccw->cda = (__u32)virt_to_phys(data); memset(data, 0, sizeof(struct LO_eckd_data)); sector = 0; @@ -830,7 +830,7 @@ static void dasd_eckd_fill_rcd_cqr(struct dasd_device *device, ccw = cqr->cpaddr; ccw->cmd_code = DASD_ECKD_CCW_RCD; ccw->flags = 0; - ccw->cda = (__u32)(addr_t)rcd_buffer; + ccw->cda = (__u32)virt_to_phys(rcd_buffer); ccw->count = DASD_ECKD_RCD_DATA_SIZE; cqr->magic = DASD_ECKD_MAGIC; @@ -858,7 +858,7 @@ static void read_conf_cb(struct dasd_ccw_req *cqr, void *data) if (cqr->status != DASD_CQR_DONE) { ccw = cqr->cpaddr; - rcd_buffer = (__u8 *)((addr_t) ccw->cda); + rcd_buffer = phys_to_virt(ccw->cda); memset(rcd_buffer, 0, sizeof(*rcd_buffer)); rcd_buffer[0] = 0xE5; @@ -1547,7 +1547,7 @@ static int dasd_eckd_read_features(struct dasd_device *device) ccw->cmd_code = DASD_ECKD_CCW_PSF; ccw->count = sizeof(struct dasd_psf_prssd_data); ccw->flags |= CCW_FLAG_CC; - ccw->cda = (__u32)(addr_t) prssdp; + ccw->cda = (__u32)virt_to_phys(prssdp); /* Read Subsystem Data - feature codes */ features = (struct dasd_rssd_features *) (prssdp + 1); @@ -1556,7 +1556,7 @@ static int dasd_eckd_read_features(struct dasd_device *device) ccw++; ccw->cmd_code = DASD_ECKD_CCW_RSSD; ccw->count = sizeof(struct dasd_rssd_features); - ccw->cda = (__u32)(addr_t) features; + ccw->cda = (__u32)virt_to_phys(features); cqr->buildclk = get_tod_clock(); cqr->status = DASD_CQR_FILLED; @@ -1616,7 +1616,7 @@ static int dasd_eckd_read_vol_info(struct dasd_device *device) ccw->cmd_code = DASD_ECKD_CCW_PSF; ccw->count = sizeof(*prssdp); ccw->flags |= CCW_FLAG_CC; - ccw->cda = (__u32)(addr_t)prssdp; + ccw->cda = (__u32)virt_to_phys(prssdp); /* Read Subsystem Data - Volume Storage Query */ vsq = (struct dasd_rssd_vsq *)(prssdp + 1); @@ -1626,7 +1626,7 @@ static int dasd_eckd_read_vol_info(struct dasd_device *device) ccw->cmd_code = DASD_ECKD_CCW_RSSD; ccw->count = sizeof(*vsq); ccw->flags |= CCW_FLAG_SLI; - ccw->cda = (__u32)(addr_t)vsq; + ccw->cda = (__u32)virt_to_phys(vsq); cqr->buildclk = get_tod_clock(); cqr->status = DASD_CQR_FILLED; @@ -1801,7 +1801,7 @@ static int dasd_eckd_read_ext_pool_info(struct dasd_device *device) ccw->cmd_code = DASD_ECKD_CCW_PSF; ccw->count = sizeof(*prssdp); ccw->flags |= CCW_FLAG_CC; - ccw->cda = (__u32)(addr_t)prssdp; + ccw->cda = (__u32)virt_to_phys(prssdp); lcq = (struct dasd_rssd_lcq *)(prssdp + 1); memset(lcq, 0, sizeof(*lcq)); @@ -1810,7 +1810,7 @@ static int dasd_eckd_read_ext_pool_info(struct dasd_device *device) ccw->cmd_code = DASD_ECKD_CCW_RSSD; ccw->count = sizeof(*lcq); ccw->flags |= CCW_FLAG_SLI; - ccw->cda = (__u32)(addr_t)lcq; + ccw->cda = (__u32)virt_to_phys(lcq); cqr->buildclk = get_tod_clock(); cqr->status = DASD_CQR_FILLED; @@ -1907,7 +1907,7 @@ static struct dasd_ccw_req *dasd_eckd_build_psf_ssc(struct dasd_device *device, } ccw = cqr->cpaddr; ccw->cmd_code = DASD_ECKD_CCW_PSF; - ccw->cda = (__u32)(addr_t)psf_ssc_data; + ccw->cda = (__u32)virt_to_phys(psf_ssc_data); ccw->count = 66; cqr->startdev = device; @@ -2262,7 +2262,7 @@ dasd_eckd_analysis_ccw(struct dasd_device *device) ccw->cmd_code = DASD_ECKD_CCW_READ_COUNT; ccw->flags = 0; ccw->count = 8; - ccw->cda = (__u32)(addr_t) count_data; + ccw->cda = (__u32)virt_to_phys(count_data); ccw++; count_data++; } @@ -2276,7 +2276,7 @@ dasd_eckd_analysis_ccw(struct dasd_device *device) ccw->cmd_code = DASD_ECKD_CCW_READ_COUNT; ccw->flags = 0; ccw->count = 8; - ccw->cda = (__u32)(addr_t) count_data; + ccw->cda = (__u32)virt_to_phys(count_data); cqr->block = NULL; cqr->startdev = device; @@ -2647,7 +2647,7 @@ dasd_eckd_build_check(struct dasd_device *base, struct format_data_t *fdata, ccw->cmd_code = DASD_ECKD_CCW_READ_COUNT; ccw->flags = CCW_FLAG_SLI; ccw->count = 8; - ccw->cda = (__u32)(addr_t) fmt_buffer; + ccw->cda = (__u32)virt_to_phys(fmt_buffer); ccw++; fmt_buffer++; } @@ -2857,7 +2857,7 @@ dasd_eckd_build_format(struct dasd_device *base, struct dasd_device *startdev, ccw->cmd_code = DASD_ECKD_CCW_WRITE_RECORD_ZERO; ccw->flags = CCW_FLAG_SLI; ccw->count = 8; - ccw->cda = (__u32)(addr_t) ect; + ccw->cda = (__u32)virt_to_phys(ect); ccw++; } if ((intensity & ~0x08) & 0x04) { /* erase track */ @@ -2872,7 +2872,7 @@ dasd_eckd_build_format(struct dasd_device *base, struct dasd_device *startdev, ccw->cmd_code = DASD_ECKD_CCW_WRITE_CKD; ccw->flags = CCW_FLAG_SLI; ccw->count = 8; - ccw->cda = (__u32)(addr_t) ect; + ccw->cda = (__u32)virt_to_phys(ect); } else { /* write remaining records */ for (i = 0; i < rpt; i++) { ect = (struct eckd_count *) data; @@ -2907,7 +2907,7 @@ dasd_eckd_build_format(struct dasd_device *base, struct dasd_device *startdev, DASD_ECKD_CCW_WRITE_CKD_MT; ccw->flags = CCW_FLAG_SLI; ccw->count = 8; - ccw->cda = (__u32)(addr_t) ect; + ccw->cda = (__u32)virt_to_phys(ect); ccw++; } } @@ -3821,7 +3821,7 @@ dasd_eckd_dso_ras(struct dasd_device *device, struct dasd_block *block, } ccw = cqr->cpaddr; - ccw->cda = (__u32)(addr_t)cqr->data; + ccw->cda = (__u32)virt_to_phys(cqr->data); ccw->cmd_code = DASD_ECKD_CCW_DSO; ccw->count = size; @@ -4090,11 +4090,11 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( ccw->cmd_code = rcmd; ccw->count = count; if (idal_is_needed(dst, blksize)) { - ccw->cda = (__u32)(addr_t) idaws; + ccw->cda = (__u32)virt_to_phys(idaws); ccw->flags = CCW_FLAG_IDA; idaws = idal_create_words(idaws, dst, blksize); } else { - ccw->cda = (__u32)(addr_t) dst; + ccw->cda = (__u32)virt_to_phys(dst); ccw->flags = 0; } ccw++; @@ -4228,7 +4228,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_track( ccw[-1].flags |= CCW_FLAG_CC; ccw->cmd_code = cmd; ccw->count = len_to_track_end; - ccw->cda = (__u32)(addr_t)idaws; + ccw->cda = (__u32)virt_to_phys(idaws); ccw->flags = CCW_FLAG_IDA; ccw++; recid += count; @@ -4244,7 +4244,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_track( * idaw ends */ if (!idaw_dst) { - if (__pa(dst) & (IDA_BLOCK_SIZE-1)) { + if ((__u32)virt_to_phys(dst) & (IDA_BLOCK_SIZE - 1)) { dasd_sfree_request(cqr, startdev); return ERR_PTR(-ERANGE); } else @@ -4264,7 +4264,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_track( * idal_create_words will handle cases where idaw_len * is larger then IDA_BLOCK_SIZE */ - if (!(__pa(idaw_dst + idaw_len) & (IDA_BLOCK_SIZE-1))) + if (!((__u32)virt_to_phys(idaw_dst + idaw_len) & (IDA_BLOCK_SIZE - 1))) end_idaw = 1; /* We also need to end the idaw at track end */ if (!len_to_track_end) { @@ -4817,7 +4817,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_raw(struct dasd_device *startdev, ccw->count = 57326; /* 64k map to one track */ len_to_track_end = 65536 - start_padding_sectors * 512; - ccw->cda = (__u32)(addr_t)idaws; + ccw->cda = (__u32)virt_to_phys(idaws); ccw->flags |= CCW_FLAG_IDA; ccw->flags |= CCW_FLAG_SLI; ccw++; @@ -4836,7 +4836,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_raw(struct dasd_device *startdev, ccw->count = 57326; /* 64k map to one track */ len_to_track_end = 65536; - ccw->cda = (__u32)(addr_t)idaws; + ccw->cda = (__u32)virt_to_phys(idaws); ccw->flags |= CCW_FLAG_IDA; ccw->flags |= CCW_FLAG_SLI; ccw++; @@ -4893,9 +4893,9 @@ dasd_eckd_free_cp(struct dasd_ccw_req *cqr, struct request *req) ccw++; if (dst) { if (ccw->flags & CCW_FLAG_IDA) - cda = *((char **)((addr_t) ccw->cda)); + cda = *((char **)phys_to_virt(ccw->cda)); else - cda = (char *)((addr_t) ccw->cda); + cda = phys_to_virt(ccw->cda); if (dst != cda) { if (rq_data_dir(req) == READ) memcpy(dst, cda, bv.bv_len); @@ -5045,7 +5045,7 @@ dasd_eckd_release(struct dasd_device *device) ccw->cmd_code = DASD_ECKD_CCW_RELEASE; ccw->flags |= CCW_FLAG_SLI; ccw->count = 32; - ccw->cda = (__u32)(addr_t) cqr->data; + ccw->cda = (__u32)virt_to_phys(cqr->data); cqr->startdev = device; cqr->memdev = device; clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); @@ -5100,7 +5100,7 @@ dasd_eckd_reserve(struct dasd_device *device) ccw->cmd_code = DASD_ECKD_CCW_RESERVE; ccw->flags |= CCW_FLAG_SLI; ccw->count = 32; - ccw->cda = (__u32)(addr_t) cqr->data; + ccw->cda = (__u32)virt_to_phys(cqr->data); cqr->startdev = device; cqr->memdev = device; clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); @@ -5154,7 +5154,7 @@ dasd_eckd_steal_lock(struct dasd_device *device) ccw->cmd_code = DASD_ECKD_CCW_SLCK; ccw->flags |= CCW_FLAG_SLI; ccw->count = 32; - ccw->cda = (__u32)(addr_t) cqr->data; + ccw->cda = (__u32)virt_to_phys(cqr->data); cqr->startdev = device; cqr->memdev = device; clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); @@ -5215,7 +5215,7 @@ static int dasd_eckd_snid(struct dasd_device *device, ccw->cmd_code = DASD_ECKD_CCW_SNID; ccw->flags |= CCW_FLAG_SLI; ccw->count = 12; - ccw->cda = (__u32)(addr_t) cqr->data; + ccw->cda = (__u32)virt_to_phys(cqr->data); cqr->startdev = device; cqr->memdev = device; clear_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); @@ -5282,7 +5282,7 @@ dasd_eckd_performance(struct dasd_device *device, void __user *argp) ccw->cmd_code = DASD_ECKD_CCW_PSF; ccw->count = sizeof(struct dasd_psf_prssd_data); ccw->flags |= CCW_FLAG_CC; - ccw->cda = (__u32)(addr_t) prssdp; + ccw->cda = (__u32)virt_to_phys(prssdp); /* Read Subsystem Data - Performance Statistics */ stats = (struct dasd_rssd_perf_stats_t *) (prssdp + 1); @@ -5291,7 +5291,7 @@ dasd_eckd_performance(struct dasd_device *device, void __user *argp) ccw++; ccw->cmd_code = DASD_ECKD_CCW_RSSD; ccw->count = sizeof(struct dasd_rssd_perf_stats_t); - ccw->cda = (__u32)(addr_t) stats; + ccw->cda = (__u32)virt_to_phys(stats); cqr->buildclk = get_tod_clock(); cqr->status = DASD_CQR_FILLED; @@ -5435,7 +5435,7 @@ static int dasd_symm_io(struct dasd_device *device, void __user *argp) ccw->cmd_code = DASD_ECKD_CCW_PSF; ccw->count = usrparm.psf_data_len; ccw->flags |= CCW_FLAG_CC; - ccw->cda = (__u32)(addr_t) psf_data; + ccw->cda = (__u32)virt_to_phys(psf_data); ccw++; @@ -5443,7 +5443,7 @@ static int dasd_symm_io(struct dasd_device *device, void __user *argp) ccw->cmd_code = DASD_ECKD_CCW_RSSD; ccw->count = usrparm.rssd_result_len; ccw->flags = CCW_FLAG_SLI ; - ccw->cda = (__u32)(addr_t) rssd_result; + ccw->cda = (__u32)virt_to_phys(rssd_result); rc = dasd_sleep_on(cqr); if (rc) @@ -5512,9 +5512,9 @@ dasd_eckd_dump_ccw_range(struct ccw1 *from, struct ccw1 *to, char *page) /* get pointer to data (consider IDALs) */ if (from->flags & CCW_FLAG_IDA) - datap = (char *) *((addr_t *) (addr_t) from->cda); + datap = (char *)*((addr_t *)phys_to_virt(from->cda)); else - datap = (char *) ((addr_t) from->cda); + datap = phys_to_virt(from->cda); /* dump data (max 128 bytes) */ for (count = 0; count < from->count && count < 128; count++) { @@ -5585,7 +5585,7 @@ static void dasd_eckd_dump_sense_ccw(struct dasd_device *device, len += sprintf(page + len, PRINTK_HEADER " device %s: Failing CCW: %p\n", dev_name(&device->cdev->dev), - (void *) (addr_t) irb->scsw.cmd.cpa); + phys_to_virt(irb->scsw.cmd.cpa)); if (irb->esw.esw0.erw.cons) { for (sl = 0; sl < 4; sl++) { len += sprintf(page + len, PRINTK_HEADER @@ -5632,8 +5632,7 @@ static void dasd_eckd_dump_sense_ccw(struct dasd_device *device, /* print failing CCW area (maximum 4) */ /* scsw->cda is either valid or zero */ from = ++to; - fail = (struct ccw1 *)(addr_t) - irb->scsw.cmd.cpa; /* failing CCW */ + fail = phys_to_virt(irb->scsw.cmd.cpa); /* failing CCW */ if (from < fail - 2) { from = fail - 2; /* there is a gap - print header */ printk(KERN_ERR PRINTK_HEADER "......\n"); @@ -5687,13 +5686,12 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device, len += sprintf(page + len, PRINTK_HEADER " device %s: Failing TCW: %p\n", dev_name(&device->cdev->dev), - (void *) (addr_t) irb->scsw.tm.tcw); + phys_to_virt(irb->scsw.tm.tcw)); tsb = NULL; sense = NULL; if (irb->scsw.tm.tcw && (irb->scsw.tm.fcxs & 0x01)) - tsb = tcw_get_tsb( - (struct tcw *)(unsigned long)irb->scsw.tm.tcw); + tsb = tcw_get_tsb(phys_to_virt(irb->scsw.tm.tcw)); if (tsb) { len += sprintf(page + len, PRINTK_HEADER @@ -5917,7 +5915,7 @@ retry: ccw->count = sizeof(struct dasd_psf_prssd_data); ccw->flags |= CCW_FLAG_CC; ccw->flags |= CCW_FLAG_SLI; - ccw->cda = (__u32)(addr_t) prssdp; + ccw->cda = (__u32)virt_to_phys(prssdp); /* Read Subsystem Data - message buffer */ message_buf = (struct dasd_rssd_messages *) (prssdp + 1); @@ -5927,7 +5925,7 @@ retry: ccw->cmd_code = DASD_ECKD_CCW_RSSD; ccw->count = sizeof(struct dasd_rssd_messages); ccw->flags |= CCW_FLAG_SLI; - ccw->cda = (__u32)(addr_t) message_buf; + ccw->cda = (__u32)virt_to_phys(message_buf); cqr->buildclk = get_tod_clock(); cqr->status = DASD_CQR_FILLED; @@ -6008,14 +6006,14 @@ static int dasd_eckd_query_host_access(struct dasd_device *device, ccw->count = sizeof(struct dasd_psf_prssd_data); ccw->flags |= CCW_FLAG_CC; ccw->flags |= CCW_FLAG_SLI; - ccw->cda = (__u32)(addr_t) prssdp; + ccw->cda = (__u32)virt_to_phys(prssdp); /* Read Subsystem Data - query host access */ ccw++; ccw->cmd_code = DASD_ECKD_CCW_RSSD; ccw->count = sizeof(struct dasd_psf_query_host_access); ccw->flags |= CCW_FLAG_SLI; - ccw->cda = (__u32)(addr_t) host_access; + ccw->cda = (__u32)virt_to_phys(host_access); cqr->buildclk = get_tod_clock(); cqr->status = DASD_CQR_FILLED; @@ -6351,7 +6349,7 @@ dasd_eckd_psf_cuir_response(struct dasd_device *device, int response, psf_cuir->ssid = device->path[pos].ssid; ccw = cqr->cpaddr; ccw->cmd_code = DASD_ECKD_CCW_PSF; - ccw->cda = (__u32)(addr_t)psf_cuir; + ccw->cda = (__u32)virt_to_phys(psf_cuir); ccw->flags = CCW_FLAG_SLI; ccw->count = sizeof(struct dasd_psf_cuir_response); diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c index d4d31cd11d26..a4cc772208a6 100644 --- a/drivers/s390/block/dasd_eer.c +++ b/drivers/s390/block/dasd_eer.c @@ -491,7 +491,7 @@ int dasd_eer_enable(struct dasd_device *device) ccw->cmd_code = DASD_ECKD_CCW_SNSS; ccw->count = SNSS_DATA_SIZE; ccw->flags = 0; - ccw->cda = (__u32)(addr_t) cqr->data; + ccw->cda = (__u32)virt_to_phys(cqr->data); cqr->buildclk = get_tod_clock(); cqr->status = DASD_CQR_FILLED; diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index cddfb01a3dca..bcb67fa747a7 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -83,7 +83,7 @@ define_extent(struct ccw1 * ccw, struct DE_fba_data *data, int rw, ccw->cmd_code = DASD_FBA_CCW_DEFINE_EXTENT; ccw->flags = 0; ccw->count = 16; - ccw->cda = (__u32) __pa(data); + ccw->cda = (__u32)virt_to_phys(data); memset(data, 0, sizeof (struct DE_fba_data)); if (rw == WRITE) (data->mask).perm = 0x0; @@ -103,7 +103,7 @@ locate_record(struct ccw1 * ccw, struct LO_fba_data *data, int rw, ccw->cmd_code = DASD_FBA_CCW_LOCATE; ccw->flags = 0; ccw->count = 8; - ccw->cda = (__u32) __pa(data); + ccw->cda = (__u32)virt_to_phys(data); memset(data, 0, sizeof (struct LO_fba_data)); if (rw == WRITE) data->operation.cmd = 0x5; @@ -262,7 +262,7 @@ static void ccw_write_zero(struct ccw1 *ccw, int count) ccw->cmd_code = DASD_FBA_CCW_WRITE; ccw->flags |= CCW_FLAG_SLI; ccw->count = count; - ccw->cda = (__u32) (addr_t) dasd_fba_zero_page; + ccw->cda = (__u32)virt_to_phys(dasd_fba_zero_page); } /* @@ -528,11 +528,11 @@ static struct dasd_ccw_req *dasd_fba_build_cp_regular( ccw->cmd_code = cmd; ccw->count = block->bp_block; if (idal_is_needed(dst, blksize)) { - ccw->cda = (__u32)(addr_t) idaws; + ccw->cda = (__u32)virt_to_phys(idaws); ccw->flags = CCW_FLAG_IDA; idaws = idal_create_words(idaws, dst, blksize); } else { - ccw->cda = (__u32)(addr_t) dst; + ccw->cda = (__u32)virt_to_phys(dst); ccw->flags = 0; } ccw++; @@ -590,9 +590,9 @@ dasd_fba_free_cp(struct dasd_ccw_req *cqr, struct request *req) ccw++; if (dst) { if (ccw->flags & CCW_FLAG_IDA) - cda = *((char **)((addr_t) ccw->cda)); + cda = *((char **)phys_to_virt(ccw->cda)); else - cda = (char *)((addr_t) ccw->cda); + cda = phys_to_virt(ccw->cda); if (dst != cda) { if (rq_data_dir(req) == READ) memcpy(dst, cda, bv.bv_len); -- cgit v1.2.3 From 460e9bed82e49db1b823dcb4e421783854d86c40 Mon Sep 17 00:00:00 2001 From: Qiheng Lin Date: Fri, 10 Feb 2023 01:02:53 +0100 Subject: s390/dasd: Fix potential memleak in dasd_eckd_init() `dasd_reserve_req` is allocated before `dasd_vol_info_req`, and it also needs to be freed before the error returns, just like the other cases in this function. Fixes: 9e12e54c7a8f ("s390/dasd: Handle out-of-space constraint") Signed-off-by: Qiheng Lin Link: https://lore.kernel.org/r/20221208133809.16796-1-linqiheng@huawei.com Signed-off-by: Stefan Haberland Link: https://lore.kernel.org/r/20230210000253.1644903-3-sth@linux.ibm.com Signed-off-by: Jens Axboe --- drivers/s390/block/dasd_eckd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index 03ecd8846378..1a69f97e88fb 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -6954,8 +6954,10 @@ dasd_eckd_init(void) return -ENOMEM; dasd_vol_info_req = kmalloc(sizeof(*dasd_vol_info_req), GFP_KERNEL | GFP_DMA); - if (!dasd_vol_info_req) + if (!dasd_vol_info_req) { + kfree(dasd_reserve_req); return -ENOMEM; + } pe_handler_worker = kmalloc(sizeof(*pe_handler_worker), GFP_KERNEL | GFP_DMA); if (!pe_handler_worker) { -- cgit v1.2.3 From 2f1e07dda1e1310873647abc40bbc49eaf3b10e3 Mon Sep 17 00:00:00 2001 From: Liu Xiaodong Date: Fri, 10 Feb 2023 09:13:56 -0500 Subject: block: ublk: check IO buffer based on flag need_get_data Currently, uring_cmd with UBLK_IO_FETCH_REQ or UBLK_IO_COMMIT_AND_FETCH_REQ is always checked whether userspace server has provided IO buffer even flag UBLK_F_NEED_GET_DATA is configured. This is a excessive check. If UBLK_F_NEED_GET_DATA is configured, FETCH_RQ doesn't need to provide IO buffer; COMMIT_AND_FETCH_REQ also doesn't need to do that if the IO type is not READ. Check ub_cmd->addr together with ublk_need_get_data() and IO type in ublk_ch_uring_cmd(). With this fix, userspace server doesn't need to preserve buffers for every ublk_io when flag UBLK_F_NEED_GET_DATA is configured, in order to save memory. Signed-off-by: Liu Xiaodong Fixes: c86019ff75c1 ("ublk_drv: add support for UBLK_IO_NEED_GET_DATA") Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230210141356.112321-1-xiaodong.liu@intel.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index e6eceee44366..f48d213fb65e 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1265,6 +1265,7 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) u32 cmd_op = cmd->cmd_op; unsigned tag = ub_cmd->tag; int ret = -EINVAL; + struct request *req; pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", __func__, cmd->cmd_op, ub_cmd->q_id, tag, @@ -1315,8 +1316,8 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) */ if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) goto out; - /* FETCH_RQ has to provide IO buffer */ - if (!ub_cmd->addr) + /* FETCH_RQ has to provide IO buffer if NEED GET DATA is not enabled */ + if (!ub_cmd->addr && !ublk_need_get_data(ubq)) goto out; io->cmd = cmd; io->flags |= UBLK_IO_FLAG_ACTIVE; @@ -1325,8 +1326,12 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) ublk_mark_io_ready(ub, ubq); break; case UBLK_IO_COMMIT_AND_FETCH_REQ: - /* FETCH_RQ has to provide IO buffer */ - if (!ub_cmd->addr) + req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); + /* + * COMMIT_AND_FETCH_REQ has to provide IO buffer if NEED GET DATA is + * not enabled or it is Read IO. + */ + if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ)) goto out; if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) goto out; -- cgit v1.2.3 From 8f0edf45bb676ec3558d6b668ad3f6a7d54cf601 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 10 Feb 2023 10:03:46 -0800 Subject: nvme-pci: fix freeing single sgl There may only be a single DMA mapped entry from multiple physical segments, which means we don't allocate a separte SGL list. Check the number of allocations prior to know if we need to free something. Freeing a single list allocation is the same for both PRP and SGL usages, so we don't need to check the use_sgl flag anymore. Fixes: 01df742d8c5c0 ("nvme-pci: remove SGL segment descriptors") Reported-by: Niklas Schnelle Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Tested-by: Niklas Schnelle --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a331fbfa9a66..47d6b0023e3a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -556,7 +556,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) if (iod->nr_allocations == 0) dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list, iod->first_dma); - else if (iod->use_sgl) + else if (iod->nr_allocations == 1) dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list, iod->first_dma); else -- cgit v1.2.3 From b6c0c237bea191fb99b6c2de093262402b0159a6 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 10 Feb 2023 10:03:47 -0800 Subject: nvme-pci: remove iod use_sgls It's not used anywhere anymore, so remove it. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 47d6b0023e3a..d68e2db00d0d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -230,7 +230,6 @@ union nvme_descriptor { struct nvme_iod { struct nvme_request req; struct nvme_command cmd; - bool use_sgl; bool aborted; s8 nr_allocations; /* PRP list pool allocations. 0 means small pool in use */ @@ -808,8 +807,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, goto out_free_sg; } - iod->use_sgl = nvme_pci_use_sgls(dev, req, iod->sgt.nents); - if (iod->use_sgl) + if (nvme_pci_use_sgls(dev, req, iod->sgt.nents)) ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); else ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); -- cgit v1.2.3 From db0ccc44a20b4bb3039c0f6885a1f9c3323c7673 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 16 Feb 2023 07:57:32 -0700 Subject: brd: return 0/-error from brd_insert_page() It currently returns a page, but callers just check for NULL/page to gauge success. Clean this up and return the appropriate error directly instead. Cc: stable@vger.kernel.org # 5.10+ Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/brd.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 20acc4a1fd6d..15a148d5aad9 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -78,11 +78,9 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) } /* - * Look up and return a brd's page for a given sector. - * If one does not exist, allocate an empty page, and insert that. Then - * return it. + * Insert a new page for a given sector, if one does not already exist. */ -static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) +static int brd_insert_page(struct brd_device *brd, sector_t sector) { pgoff_t idx; struct page *page; @@ -90,7 +88,7 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) page = brd_lookup_page(brd, sector); if (page) - return page; + return 0; /* * Must use NOIO because we don't want to recurse back into the @@ -99,11 +97,11 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM; page = alloc_page(gfp_flags); if (!page) - return NULL; + return -ENOMEM; if (radix_tree_preload(GFP_NOIO)) { __free_page(page); - return NULL; + return -ENOMEM; } spin_lock(&brd->brd_lock); @@ -120,8 +118,7 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) spin_unlock(&brd->brd_lock); radix_tree_preload_end(); - - return page; + return 0; } /* @@ -174,16 +171,17 @@ static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n) { unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; size_t copy; + int ret; copy = min_t(size_t, n, PAGE_SIZE - offset); - if (!brd_insert_page(brd, sector)) - return -ENOSPC; + ret = brd_insert_page(brd, sector); + if (ret) + return ret; if (copy < n) { sector += copy >> SECTOR_SHIFT; - if (!brd_insert_page(brd, sector)) - return -ENOSPC; + ret = brd_insert_page(brd, sector); } - return 0; + return ret; } /* -- cgit v1.2.3 From 6ded703c56c21bfb259725d4f1831a5feb563e9b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 16 Feb 2023 08:01:08 -0700 Subject: brd: check for REQ_NOWAIT and set correct page allocation mask If REQ_NOWAIT is set, then do a non-blocking allocation if the operation is a write and we need to insert a new page. Currently REQ_NOWAIT cannot be set as the queue isn't marked as supporting nowait, this change is in preparation for allowing that. radix_tree_preload() warns on attempting to call it with an allocation mask that doesn't allow blocking. While that warning could arguably be removed, we need to handle radix insertion failures anyway as they are more likely if we cannot block to get memory. Remove legacy BUG_ON()'s and turn them into proper errors instead, one for the allocation failure and one for finding a page that doesn't match the correct index. Cc: stable@vger.kernel.org # 5.10+ Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/brd.c | 48 ++++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 15a148d5aad9..00f3c5b51a01 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -80,26 +80,21 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) /* * Insert a new page for a given sector, if one does not already exist. */ -static int brd_insert_page(struct brd_device *brd, sector_t sector) +static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) { pgoff_t idx; struct page *page; - gfp_t gfp_flags; + int ret = 0; page = brd_lookup_page(brd, sector); if (page) return 0; - /* - * Must use NOIO because we don't want to recurse back into the - * block or filesystem layers from page reclaim. - */ - gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM; - page = alloc_page(gfp_flags); + page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM); if (!page) return -ENOMEM; - if (radix_tree_preload(GFP_NOIO)) { + if (gfpflags_allow_blocking(gfp) && radix_tree_preload(gfp)) { __free_page(page); return -ENOMEM; } @@ -110,15 +105,17 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector) if (radix_tree_insert(&brd->brd_pages, idx, page)) { __free_page(page); page = radix_tree_lookup(&brd->brd_pages, idx); - BUG_ON(!page); - BUG_ON(page->index != idx); + if (!page) + ret = -ENOMEM; + else if (page->index != idx) + ret = -EIO; } else { brd->brd_nr_pages++; } spin_unlock(&brd->brd_lock); radix_tree_preload_end(); - return 0; + return ret; } /* @@ -167,19 +164,20 @@ static void brd_free_pages(struct brd_device *brd) /* * copy_to_brd_setup must be called before copy_to_brd. It may sleep. */ -static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n) +static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n, + gfp_t gfp) { unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; size_t copy; int ret; copy = min_t(size_t, n, PAGE_SIZE - offset); - ret = brd_insert_page(brd, sector); + ret = brd_insert_page(brd, sector, gfp); if (ret) return ret; if (copy < n) { sector += copy >> SECTOR_SHIFT; - ret = brd_insert_page(brd, sector); + ret = brd_insert_page(brd, sector, gfp); } return ret; } @@ -254,20 +252,26 @@ static void copy_from_brd(void *dst, struct brd_device *brd, * Process a single bvec of a bio. */ static int brd_do_bvec(struct brd_device *brd, struct page *page, - unsigned int len, unsigned int off, enum req_op op, + unsigned int len, unsigned int off, blk_opf_t opf, sector_t sector) { void *mem; int err = 0; - if (op_is_write(op)) { - err = copy_to_brd_setup(brd, sector, len); + if (op_is_write(opf)) { + /* + * Must use NOIO because we don't want to recurse back into the + * block or filesystem layers from page reclaim. + */ + gfp_t gfp = opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO; + + err = copy_to_brd_setup(brd, sector, len, gfp); if (err) goto out; } mem = kmap_atomic(page); - if (!op_is_write(op)) { + if (!op_is_write(opf)) { copy_from_brd(mem + off, brd, sector, len); flush_dcache_page(page); } else { @@ -296,8 +300,12 @@ static void brd_submit_bio(struct bio *bio) (len & (SECTOR_SIZE - 1))); err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, - bio_op(bio), sector); + bio->bi_opf, sector); if (err) { + if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) { + bio_wouldblock_error(bio); + return; + } bio_io_error(bio); return; } -- cgit v1.2.3 From 67205f80be9910207481406c47f7d85e703fb2e9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 15 Feb 2023 16:43:47 -0700 Subject: brd: mark as nowait compatible By default, non-mq drivers do not support nowait. This causes io_uring to use a slower path as the driver cannot be trust not to block. brd can safely set the nowait flag, as worst case all it does is a NOIO allocation. For io_uring, this makes a substantial difference. Before: submitter=0, tid=453, file=/dev/ram0, node=-1 polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=128 Engine=io_uring, sq_ring=128, cq_ring=128 IOPS=440.03K, BW=1718MiB/s, IOS/call=32/31 IOPS=428.96K, BW=1675MiB/s, IOS/call=32/32 IOPS=442.59K, BW=1728MiB/s, IOS/call=32/31 IOPS=419.65K, BW=1639MiB/s, IOS/call=32/32 IOPS=426.82K, BW=1667MiB/s, IOS/call=32/31 and after: submitter=0, tid=354, file=/dev/ram0, node=-1 polled=0, fixedbufs=1/0, register_files=1, buffered=0, QD=128 Engine=io_uring, sq_ring=128, cq_ring=128 IOPS=3.37M, BW=13.15GiB/s, IOS/call=32/31 IOPS=3.45M, BW=13.46GiB/s, IOS/call=32/31 IOPS=3.43M, BW=13.42GiB/s, IOS/call=32/32 IOPS=3.43M, BW=13.39GiB/s, IOS/call=32/31 IOPS=3.43M, BW=13.38GiB/s, IOS/call=32/31 or about an 8x in difference. Now that brd is prepared to deal with REQ_NOWAIT reads/writes, mark it as supporting that. Cc: stable@vger.kernel.org # 5.10+ Link: https://lore.kernel.org/linux-block/20230203103005.31290-1-p.raghav@samsung.com/ Signed-off-by: Jens Axboe --- drivers/block/brd.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 00f3c5b51a01..740631dcdd0e 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -418,6 +418,7 @@ static int brd_alloc(int i) /* Tell the block layer that this is not a rotational device */ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); + blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue); err = add_disk(disk); if (err) goto out_cleanup_disk; -- cgit v1.2.3 From 0aa2988e4fd23c0c8b33999d7b47dfbc5e6bf24b Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Fri, 17 Feb 2023 17:44:44 +0530 Subject: brd: use radix_tree_maybe_preload instead of radix_tree_preload Unconditionally calling radix_tree_preload_end() results in a OOPS message as the preload is only conditionally called for gfpflags_allow_blocking(). [ 20.267323] BUG: using smp_processor_id() in preemptible [00000000] code: fio/416 [ 20.267837] caller is brd_insert_page.part.0+0xbe/0x190 [brd] [ 20.269436] Call Trace: [ 20.269598] [ 20.269742] dump_stack_lvl+0x32/0x50 [ 20.269982] check_preemption_disabled+0xd1/0xe0 [ 20.270289] brd_insert_page.part.0+0xbe/0x190 [brd] [ 20.270664] brd_submit_bio+0x33f/0xf40 [brd] Use radix_tree_maybe_preload() which does preload only if gfpflags_allow_blocking() is true but also takes the lock. Therefore, unconditionally calling radix_tree_preload_end() should not create any issues and the message disappears. Fixes: 6ded703c56c2 ("brd: check for REQ_NOWAIT and set correct page allocation mask") Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20230217121442.33914-1-p.raghav@samsung.com Signed-off-by: Jens Axboe --- drivers/block/brd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 740631dcdd0e..a8a77a1efe1e 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -94,7 +94,7 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) if (!page) return -ENOMEM; - if (gfpflags_allow_blocking(gfp) && radix_tree_preload(gfp)) { + if (radix_tree_maybe_preload(gfp)) { __free_page(page); return -ENOMEM; } -- cgit v1.2.3