summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-05-26 11:39:36 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-05-26 11:39:36 -0700
commit6f59de9bc0d576eb5a5edfea470527902315e924 (patch)
treeeeb477da0caa8e9569074f448169fb792fac90d9 /drivers
parent3e406741b19890c3d8a2ed126aa7c23b106ca9e1 (diff)
parent533c87e2ed742454957f14d7bef9f48d5a72e72d (diff)
Merge tag 'for-6.16/block-20250523' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe: - ublk updates: - Add support for updating the size of a ublk instance - Zero-copy improvements - Auto-registering of buffers for zero-copy - Series simplifying and improving GET_DATA and request lookup - Series adding quiesce support - Lots of selftests additions - Various cleanups - NVMe updates via Christoph: - add per-node DMA pools and use them for PRP/SGL allocations (Caleb Sander Mateos, Keith Busch) - nvme-fcloop refcounting fixes (Daniel Wagner) - support delayed removal of the multipath node and optionally support the multipath node for private namespaces (Nilay Shroff) - support shared CQs in the PCI endpoint target code (Wilfred Mallawa) - support admin-queue only authentication (Hannes Reinecke) - use the crc32c library instead of the crypto API (Eric Biggers) - misc cleanups (Christoph Hellwig, Marcelo Moreira, Hannes Reinecke, Leon Romanovsky, Gustavo A. R. Silva) - MD updates via Yu: - Fix that normal IO can be starved by sync IO, found by mkfs on newly created large raid5, with some clean up patches for bdev inflight counters - Clean up brd, getting rid of atomic kmaps and bvec poking - Add loop driver specifically for zoned IO testing - Eliminate blk-rq-qos calls with a static key, if not enabled - Improve hctx locking for when a plug has IO for multiple queues pending - Remove block layer bouncing support, which in turn means we can remove the per-node bounce stat as well - Improve blk-throttle support - Improve delay support for blk-throttle - Improve brd discard support - Unify IO scheduler switching. This should also fix a bunch of lockdep warnings we've been seeing, after enabling lockdep support for queue freezing/unfreezeing - Add support for block write streams via FDP (flexible data placement) on NVMe - Add a bunch of block helpers, facilitating the removal of a bunch of duplicated boilerplate code - Remove obsolete BLK_MQ pci and virtio Kconfig options - Add atomic/untorn write support to blktrace - Various little cleanups and fixes * tag 'for-6.16/block-20250523' of git://git.kernel.dk/linux: (186 commits) selftests: ublk: add test for UBLK_F_QUIESCE ublk: add feature UBLK_F_QUIESCE selftests: ublk: add test case for UBLK_U_CMD_UPDATE_SIZE traceevent/block: Add REQ_ATOMIC flag to block trace events ublk: run auto buf unregisgering in same io_ring_ctx with registering io_uring: add helper io_uring_cmd_ctx_handle() ublk: remove io argument from ublk_auto_buf_reg_fallback() ublk: handle ublk_set_auto_buf_reg() failure correctly in ublk_fetch() selftests: ublk: add test for covering UBLK_AUTO_BUF_REG_FALLBACK selftests: ublk: support UBLK_F_AUTO_BUF_REG ublk: support UBLK_AUTO_BUF_REG_FALLBACK ublk: register buffer to local io_uring with provided buf index via UBLK_F_AUTO_BUF_REG ublk: prepare for supporting to register request buffer automatically ublk: convert to refcount_t selftests: ublk: make IO & device removal test more stressful nvme: rename nvme_mpath_shutdown_disk to nvme_mpath_remove_disk nvme: introduce multipath_always_on module param nvme-multipath: introduce delayed removal of the multipath head node nvme-pci: derive and better document max segments limits nvme-pci: use struct_size for allocation struct nvme_dev ...
Diffstat (limited to 'drivers')
-rw-r--r--drivers/base/node.c2
-rw-r--r--drivers/block/Kconfig19
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/brd.c225
-rw-r--r--drivers/block/pktcdvd.c2
-rw-r--r--drivers/block/rnbd/rnbd-srv.c7
-rw-r--r--drivers/block/ublk_drv.c569
-rw-r--r--drivers/block/virtio_blk.c4
-rw-r--r--drivers/block/zloop.c1385
-rw-r--r--drivers/cdrom/cdrom.c3
-rw-r--r--drivers/md/bcache/super.c3
-rw-r--r--drivers/md/dm-bufio.c2
-rw-r--r--drivers/md/dm-integrity.c16
-rw-r--r--drivers/md/dm-raid.c3
-rw-r--r--drivers/md/md.c190
-rw-r--r--drivers/md/md.h18
-rw-r--r--drivers/md/raid1.c3
-rw-r--r--drivers/md/raid10.c9
-rw-r--r--drivers/md/raid5.c8
-rw-r--r--drivers/nvme/common/auth.c15
-rw-r--r--drivers/nvme/host/auth.c30
-rw-r--r--drivers/nvme/host/core.c205
-rw-r--r--drivers/nvme/host/fc.c13
-rw-r--r--drivers/nvme/host/multipath.c206
-rw-r--r--drivers/nvme/host/nvme.h31
-rw-r--r--drivers/nvme/host/pci.c300
-rw-r--r--drivers/nvme/host/sysfs.c7
-rw-r--r--drivers/nvme/host/tcp.c14
-rw-r--r--drivers/nvme/target/admin-cmd.c31
-rw-r--r--drivers/nvme/target/auth.c21
-rw-r--r--drivers/nvme/target/core.c94
-rw-r--r--drivers/nvme/target/discovery.c2
-rw-r--r--drivers/nvme/target/fabrics-cmd.c12
-rw-r--r--drivers/nvme/target/fc.c96
-rw-r--r--drivers/nvme/target/fcloop.c439
-rw-r--r--drivers/nvme/target/loop.c29
-rw-r--r--drivers/nvme/target/nvmet.h24
-rw-r--r--drivers/nvme/target/pci-epf.c14
-rw-r--r--drivers/nvme/target/rdma.c8
-rw-r--r--drivers/nvme/target/tcp.c100
-rw-r--r--drivers/scsi/Kconfig3
-rw-r--r--drivers/scsi/aha152x.c1
-rw-r--r--drivers/scsi/imm.c1
-rw-r--r--drivers/scsi/ppa.c1
-rw-r--r--drivers/scsi/scsi_ioctl.c2
-rw-r--r--drivers/scsi/scsi_lib.c6
-rw-r--r--drivers/usb/storage/usb.c20
47 files changed, 3231 insertions, 963 deletions
diff --git a/drivers/base/node.c b/drivers/base/node.c
index cd13ef287011..618712071a1e 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -468,7 +468,7 @@ static ssize_t node_read_meminfo(struct device *dev,
nid, K(node_page_state(pgdat, NR_PAGETABLE)),
nid, K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
nid, 0UL,
- nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
+ nid, 0UL,
nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
nid, K(sreclaimable +
node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)),
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index e48b24be45ee..0f70e2374e7f 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -407,4 +407,23 @@ config BLKDEV_UBLK_LEGACY_OPCODES
source "drivers/block/rnbd/Kconfig"
+config BLK_DEV_ZONED_LOOP
+ tristate "Zoned loopback device support"
+ depends on BLK_DEV_ZONED
+ help
+ Saying Y here will allow you to use create a zoned block device using
+ regular files for zones (one file per zones). This is useful to test
+ file systems, device mapper and applications that support zoned block
+ devices. To create a zoned loop device, no user utility is needed, a
+ zoned loop device can be created (or re-started) using a command
+ like:
+
+ echo "add id=0,zone_size_mb=256,capacity_mb=16384,conv_zones=11" > \
+ /dev/zloop-control
+
+ See Documentation/admin-guide/blockdev/zoned_loop.rst for usage
+ details.
+
+ If unsure, say N.
+
endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1105a2d4fdcb..097707aca725 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -41,5 +41,6 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
+obj-$(CONFIG_BLK_DEV_ZONED_LOOP) += zloop.o
swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 292f127cae0a..b1be6c510372 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -54,32 +54,33 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
/*
* Insert a new page for a given sector, if one does not already exist.
*/
-static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
+static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
+ blk_opf_t opf)
+ __releases(rcu)
+ __acquires(rcu)
{
- pgoff_t idx = sector >> PAGE_SECTORS_SHIFT;
- struct page *page;
- int ret = 0;
-
- page = brd_lookup_page(brd, sector);
- if (page)
- return 0;
+ gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
+ struct page *page, *ret;
+ rcu_read_unlock();
page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
+ rcu_read_lock();
if (!page)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
xa_lock(&brd->brd_pages);
- ret = __xa_insert(&brd->brd_pages, idx, page, gfp);
- if (!ret)
- brd->brd_nr_pages++;
- xa_unlock(&brd->brd_pages);
-
- if (ret < 0) {
+ ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
+ page, gfp);
+ if (ret) {
+ xa_unlock(&brd->brd_pages);
__free_page(page);
- if (ret == -EBUSY)
- ret = 0;
+ if (xa_is_err(ret))
+ return ERR_PTR(xa_err(ret));
+ return ret;
}
- return ret;
+ brd->brd_nr_pages++;
+ xa_unlock(&brd->brd_pages);
+ return page;
}
/*
@@ -100,143 +101,77 @@ static void brd_free_pages(struct brd_device *brd)
}
/*
- * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
+ * Process a single segment. The segment is capped to not cross page boundaries
+ * in both the bio and the brd backing memory.
*/
-static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n,
- gfp_t gfp)
-{
- unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
- size_t copy;
- int ret;
-
- copy = min_t(size_t, n, PAGE_SIZE - offset);
- ret = brd_insert_page(brd, sector, gfp);
- if (ret)
- return ret;
- if (copy < n) {
- sector += copy >> SECTOR_SHIFT;
- ret = brd_insert_page(brd, sector, gfp);
- }
- return ret;
-}
-
-/*
- * Copy n bytes from src to the brd starting at sector. Does not sleep.
- */
-static void copy_to_brd(struct brd_device *brd, const void *src,
- sector_t sector, size_t n)
+static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
{
+ struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
+ sector_t sector = bio->bi_iter.bi_sector;
+ u32 offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
+ blk_opf_t opf = bio->bi_opf;
struct page *page;
- void *dst;
- unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
- size_t copy;
+ void *kaddr;
- copy = min_t(size_t, n, PAGE_SIZE - offset);
- page = brd_lookup_page(brd, sector);
- BUG_ON(!page);
-
- dst = kmap_atomic(page);
- memcpy(dst + offset, src, copy);
- kunmap_atomic(dst);
-
- if (copy < n) {
- src += copy;
- sector += copy >> SECTOR_SHIFT;
- copy = n - copy;
- page = brd_lookup_page(brd, sector);
- BUG_ON(!page);
-
- dst = kmap_atomic(page);
- memcpy(dst, src, copy);
- kunmap_atomic(dst);
- }
-}
+ bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
-/*
- * Copy n bytes to dst from the brd starting at sector. Does not sleep.
- */
-static void copy_from_brd(void *dst, struct brd_device *brd,
- sector_t sector, size_t n)
-{
- struct page *page;
- void *src;
- unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
- size_t copy;
-
- copy = min_t(size_t, n, PAGE_SIZE - offset);
+ rcu_read_lock();
page = brd_lookup_page(brd, sector);
- if (page) {
- src = kmap_atomic(page);
- memcpy(dst, src + offset, copy);
- kunmap_atomic(src);
- } else
- memset(dst, 0, copy);
-
- if (copy < n) {
- dst += copy;
- sector += copy >> SECTOR_SHIFT;
- copy = n - copy;
- page = brd_lookup_page(brd, sector);
- if (page) {
- src = kmap_atomic(page);
- memcpy(dst, src, copy);
- kunmap_atomic(src);
- } else
- memset(dst, 0, copy);
+ if (!page && op_is_write(opf)) {
+ page = brd_insert_page(brd, sector, opf);
+ if (IS_ERR(page))
+ goto out_error;
}
-}
-
-/*
- * Process a single bvec of a bio.
- */
-static int brd_do_bvec(struct brd_device *brd, struct page *page,
- unsigned int len, unsigned int off, blk_opf_t opf,
- sector_t sector)
-{
- void *mem;
- int err = 0;
+ kaddr = bvec_kmap_local(&bv);
if (op_is_write(opf)) {
- /*
- * Must use NOIO because we don't want to recurse back into the
- * block or filesystem layers from page reclaim.
- */
- gfp_t gfp = opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO;
-
- err = copy_to_brd_setup(brd, sector, len, gfp);
- if (err)
- goto out;
- }
-
- mem = kmap_atomic(page);
- if (!op_is_write(opf)) {
- copy_from_brd(mem + off, brd, sector, len);
- flush_dcache_page(page);
+ memcpy_to_page(page, offset, kaddr, bv.bv_len);
} else {
- flush_dcache_page(page);
- copy_to_brd(brd, mem + off, sector, len);
+ if (page)
+ memcpy_from_page(kaddr, page, offset, bv.bv_len);
+ else
+ memset(kaddr, 0, bv.bv_len);
}
- kunmap_atomic(mem);
+ kunmap_local(kaddr);
+ rcu_read_unlock();
+
+ bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
+ return true;
+
+out_error:
+ rcu_read_unlock();
+ if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
+ bio_wouldblock_error(bio);
+ else
+ bio_io_error(bio);
+ return false;
+}
-out:
- return err;
+static void brd_free_one_page(struct rcu_head *head)
+{
+ struct page *page = container_of(head, struct page, rcu_head);
+
+ __free_page(page);
}
static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
{
- sector_t aligned_sector = (sector + PAGE_SECTORS) & ~PAGE_SECTORS;
+ sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
+ sector_t aligned_end = round_down(
+ sector + (size >> SECTOR_SHIFT), PAGE_SECTORS);
struct page *page;
- size -= (aligned_sector - sector) * SECTOR_SIZE;
+ if (aligned_end <= aligned_sector)
+ return;
+
xa_lock(&brd->brd_pages);
- while (size >= PAGE_SIZE && aligned_sector < rd_size * 2) {
+ while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
if (page) {
- __free_page(page);
+ call_rcu(&page->rcu_head, brd_free_one_page);
brd->brd_nr_pages--;
}
aligned_sector += PAGE_SECTORS;
- size -= PAGE_SIZE;
}
xa_unlock(&brd->brd_pages);
}
@@ -244,36 +179,18 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
static void brd_submit_bio(struct bio *bio)
{
struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
- sector_t sector = bio->bi_iter.bi_sector;
- struct bio_vec bvec;
- struct bvec_iter iter;
if (unlikely(op_is_discard(bio->bi_opf))) {
- brd_do_discard(brd, sector, bio->bi_iter.bi_size);
+ brd_do_discard(brd, bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size);
bio_endio(bio);
return;
}
- bio_for_each_segment(bvec, bio, iter) {
- unsigned int len = bvec.bv_len;
- int err;
-
- /* Don't support un-aligned buffer */
- WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
- (len & (SECTOR_SIZE - 1)));
-
- err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
- bio->bi_opf, sector);
- if (err) {
- if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) {
- bio_wouldblock_error(bio);
- return;
- }
- bio_io_error(bio);
+ do {
+ if (!brd_rw_bvec(brd, bio))
return;
- }
- sector += len >> SECTOR_SHIFT;
- }
+ } while (bio->bi_iter.bi_size);
bio_endio(bio);
}
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 65b96c083b3c..d5cc7bd2875c 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -725,7 +725,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
scmd = blk_mq_rq_to_pdu(rq);
if (cgc->buflen) {
- ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
+ ret = blk_rq_map_kern(rq, cgc->buffer, cgc->buflen,
GFP_NOIO);
if (ret)
goto out;
diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c
index 2ee6e9bd4e28..2df8941a6b14 100644
--- a/drivers/block/rnbd/rnbd-srv.c
+++ b/drivers/block/rnbd/rnbd-srv.c
@@ -147,12 +147,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1,
rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
- if (bio_add_page(bio, virt_to_page(data), datalen,
- offset_in_page(data)) != datalen) {
- rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n");
- err = -EINVAL;
- goto bio_put;
- }
+ bio_add_virt_nofail(bio, data, datalen);
bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
if (bio_has_data(bio) &&
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index dc104c025cd5..6f51072776f1 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -50,6 +50,8 @@
/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
+#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
+#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
@@ -64,7 +66,10 @@
| UBLK_F_CMD_IOCTL_ENCODE \
| UBLK_F_USER_COPY \
| UBLK_F_ZONED \
- | UBLK_F_USER_RECOVERY_FAIL_IO)
+ | UBLK_F_USER_RECOVERY_FAIL_IO \
+ | UBLK_F_UPDATE_SIZE \
+ | UBLK_F_AUTO_BUF_REG \
+ | UBLK_F_QUIESCE)
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE \
@@ -77,7 +82,11 @@
UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
struct ublk_rq_data {
- struct kref ref;
+ refcount_t ref;
+
+ /* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
+ u16 buf_index;
+ void *buf_ctx_handle;
};
struct ublk_uring_cmd_pdu {
@@ -99,6 +108,9 @@ struct ublk_uring_cmd_pdu {
* setup in ublk uring_cmd handler
*/
struct ublk_queue *ubq;
+
+ struct ublk_auto_buf_reg buf;
+
u16 tag;
};
@@ -131,6 +143,14 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
+/*
+ * request buffer is registered automatically, so we have to unregister it
+ * before completing this request.
+ *
+ * io_uring will unregister buffer automatically for us during exiting.
+ */
+#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
+
/* atomic RW with ubq->cancel_lock */
#define UBLK_IO_FLAG_CANCELED 0x80000000
@@ -140,7 +160,12 @@ struct ublk_io {
unsigned int flags;
int res;
- struct io_uring_cmd *cmd;
+ union {
+ /* valid if UBLK_IO_FLAG_ACTIVE is set */
+ struct io_uring_cmd *cmd;
+ /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
+ struct request *req;
+ };
};
struct ublk_queue {
@@ -198,13 +223,19 @@ struct ublk_params_header {
__u32 types;
};
+static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
const struct ublk_queue *ubq, int tag, size_t offset);
static inline unsigned int ublk_req_build_flags(struct request *req);
-static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
- int tag);
+
+static inline struct ublksrv_io_desc *
+ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
+{
+ return &ubq->io_cmd_buf[tag];
+}
+
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_ZONED;
@@ -356,8 +387,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
if (ret)
goto free_req;
- ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
- GFP_KERNEL);
+ ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
if (ret)
goto erase_desc;
@@ -477,7 +507,6 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
#endif
static inline void __ublk_complete_rq(struct request *req);
-static void ublk_complete_rq(struct kref *ref);
static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
@@ -609,6 +638,11 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
}
+static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
+{
+ return ubq->flags & UBLK_F_AUTO_BUF_REG;
+}
+
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_USER_COPY;
@@ -616,7 +650,8 @@ static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
{
- return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq);
+ return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
+ !ublk_support_auto_buf_reg(ubq);
}
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
@@ -627,8 +662,13 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
*
* for zero copy, request buffer need to be registered to io_uring
* buffer table, so reference is needed
+ *
+ * For auto buffer register, ublk server still may issue
+ * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
+ * so reference is required too.
*/
- return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq);
+ return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
+ ublk_support_auto_buf_reg(ubq);
}
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
@@ -637,7 +677,7 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- kref_init(&data->ref);
+ refcount_set(&data->ref, 1);
}
}
@@ -647,7 +687,7 @@ static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- return kref_get_unless_zero(&data->ref);
+ return refcount_inc_not_zero(&data->ref);
}
return true;
@@ -659,7 +699,8 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- kref_put(&data->ref, ublk_complete_rq);
+ if (refcount_dec_and_test(&data->ref))
+ __ublk_complete_rq(req);
} else {
__ublk_complete_rq(req);
}
@@ -695,12 +736,6 @@ static inline bool ublk_rq_has_data(const struct request *rq)
return bio_has_data(rq->bio);
}
-static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
- int tag)
-{
- return &ubq->io_cmd_buf[tag];
-}
-
static inline struct ublksrv_io_desc *
ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
{
@@ -1117,18 +1152,12 @@ exit:
blk_mq_end_request(req, res);
}
-static void ublk_complete_rq(struct kref *ref)
+static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
+ int res, unsigned issue_flags)
{
- struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
- ref);
- struct request *req = blk_mq_rq_from_pdu(data);
+ /* read cmd first because req will overwrite it */
+ struct io_uring_cmd *cmd = io->cmd;
- __ublk_complete_rq(req);
-}
-
-static void ubq_complete_io_cmd(struct ublk_io *io, int res,
- unsigned issue_flags)
-{
/* mark this cmd owned by ublksrv */
io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
@@ -1138,8 +1167,10 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res,
*/
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
+ io->req = req;
+
/* tell ublksrv one io request is coming */
- io_uring_cmd_done(io->cmd, res, 0, issue_flags);
+ io_uring_cmd_done(cmd, res, 0, issue_flags);
}
#define UBLK_REQUEUE_DELAY_MS 3
@@ -1154,16 +1185,91 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
blk_mq_end_request(rq, BLK_STS_IOERR);
}
+static void ublk_auto_buf_reg_fallback(struct request *req)
+{
+ const struct ublk_queue *ubq = req->mq_hctx->driver_data;
+ struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+ iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
+ refcount_set(&data->ref, 1);
+}
+
+static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
+ unsigned int issue_flags)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(io->cmd);
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+ int ret;
+
+ ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
+ pdu->buf.index, issue_flags);
+ if (ret) {
+ if (pdu->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
+ ublk_auto_buf_reg_fallback(req);
+ return true;
+ }
+ blk_mq_end_request(req, BLK_STS_IOERR);
+ return false;
+ }
+ /* one extra reference is dropped by ublk_io_release */
+ refcount_set(&data->ref, 2);
+
+ data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
+ /* store buffer index in request payload */
+ data->buf_index = pdu->buf.index;
+ io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
+ return true;
+}
+
+static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
+ struct request *req, struct ublk_io *io,
+ unsigned int issue_flags)
+{
+ if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
+ return ublk_auto_buf_reg(req, io, issue_flags);
+
+ ublk_init_req_ref(ubq, req);
+ return true;
+}
+
+static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
+ struct ublk_io *io)
+{
+ unsigned mapped_bytes = ublk_map_io(ubq, req, io);
+
+ /* partially mapped, update io descriptor */
+ if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
+ /*
+ * Nothing mapped, retry until we succeed.
+ *
+ * We may never succeed in mapping any bytes here because
+ * of OOM. TODO: reserve one buffer with single page pinned
+ * for providing forward progress guarantee.
+ */
+ if (unlikely(!mapped_bytes)) {
+ blk_mq_requeue_request(req, false);
+ blk_mq_delay_kick_requeue_list(req->q,
+ UBLK_REQUEUE_DELAY_MS);
+ return false;
+ }
+
+ ublk_get_iod(ubq, req->tag)->nr_sectors =
+ mapped_bytes >> 9;
+ }
+
+ return true;
+}
+
static void ublk_dispatch_req(struct ublk_queue *ubq,
struct request *req,
unsigned int issue_flags)
{
int tag = req->tag;
struct ublk_io *io = &ubq->ios[tag];
- unsigned int mapped_bytes;
- pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
- __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
+ pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
+ __func__, ubq->q_id, req->tag, io->flags,
ublk_get_iod(ubq, req->tag)->addr);
/*
@@ -1183,54 +1289,22 @@ static void ublk_dispatch_req(struct ublk_queue *ubq,
if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
/*
* We have not handled UBLK_IO_NEED_GET_DATA command yet,
- * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
+ * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
* and notify it.
*/
- if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
- io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
- pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
- __func__, io->cmd->cmd_op, ubq->q_id,
- req->tag, io->flags);
- ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
- return;
- }
- /*
- * We have handled UBLK_IO_NEED_GET_DATA command,
- * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
- * do the copy work.
- */
- io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
- /* update iod->addr because ublksrv may have passed a new io buffer */
- ublk_get_iod(ubq, req->tag)->addr = io->addr;
- pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
- __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
- ublk_get_iod(ubq, req->tag)->addr);
+ io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
+ pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
+ __func__, ubq->q_id, req->tag, io->flags);
+ ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
+ issue_flags);
+ return;
}
- mapped_bytes = ublk_map_io(ubq, req, io);
-
- /* partially mapped, update io descriptor */
- if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
- /*
- * Nothing mapped, retry until we succeed.
- *
- * We may never succeed in mapping any bytes here because
- * of OOM. TODO: reserve one buffer with single page pinned
- * for providing forward progress guarantee.
- */
- if (unlikely(!mapped_bytes)) {
- blk_mq_requeue_request(req, false);
- blk_mq_delay_kick_requeue_list(req->q,
- UBLK_REQUEUE_DELAY_MS);
- return;
- }
-
- ublk_get_iod(ubq, req->tag)->nr_sectors =
- mapped_bytes >> 9;
- }
+ if (!ublk_start_io(ubq, req, io))
+ return;
- ublk_init_req_ref(ubq, req);
- ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
+ if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
+ ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
}
static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
@@ -1590,30 +1664,6 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
-static void ublk_commit_completion(struct ublk_device *ub,
- const struct ublksrv_io_cmd *ub_cmd)
-{
- u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
- struct ublk_queue *ubq = ublk_get_queue(ub, qid);
- struct ublk_io *io = &ubq->ios[tag];
- struct request *req;
-
- /* now this cmd slot is owned by nbd driver */
- io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
- io->res = ub_cmd->result;
-
- /* find the io request and complete */
- req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
- if (WARN_ON_ONCE(unlikely(!req)))
- return;
-
- if (req_op(req) == REQ_OP_ZONE_APPEND)
- req->__sector = ub_cmd->zone_append_lba;
-
- if (likely(!blk_should_fake_timeout(req->q)))
- ublk_put_req_ref(ubq, req);
-}
-
static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
struct request *req)
{
@@ -1642,17 +1692,8 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
- if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
- struct request *rq;
-
- /*
- * Either we fail the request or ublk_rq_task_work_cb
- * will do it
- */
- rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
- if (rq && blk_mq_request_started(rq))
- __ublk_fail_req(ubq, io, rq);
- }
+ if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
+ __ublk_fail_req(ubq, io, io->req);
}
}
@@ -1940,6 +1981,20 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
io_uring_cmd_mark_cancelable(cmd, issue_flags);
}
+static inline int ublk_set_auto_buf_reg(struct io_uring_cmd *cmd)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+ pdu->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
+
+ if (pdu->buf.reserved0 || pdu->buf.reserved1)
+ return -EINVAL;
+
+ if (pdu->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
+ return -EINVAL;
+ return 0;
+}
+
static void ublk_io_release(void *priv)
{
struct request *rq = priv;
@@ -1953,16 +2008,12 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
unsigned int index, unsigned int issue_flags)
{
struct ublk_device *ub = cmd->file->private_data;
- const struct ublk_io *io = &ubq->ios[tag];
struct request *req;
int ret;
if (!ublk_support_zero_copy(ubq))
return -EINVAL;
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- return -EINVAL;
-
req = __ublk_check_and_get_req(ub, ubq, tag, 0);
if (!req)
return -EINVAL;
@@ -1978,17 +2029,12 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
}
static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq, unsigned int tag,
+ const struct ublk_queue *ubq,
unsigned int index, unsigned int issue_flags)
{
- const struct ublk_io *io = &ubq->ios[tag];
-
if (!ublk_support_zero_copy(ubq))
return -EINVAL;
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- return -EINVAL;
-
return io_buffer_unregister_bvec(cmd, index, issue_flags);
}
@@ -2031,6 +2077,12 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
goto out;
}
+ if (ublk_support_auto_buf_reg(ubq)) {
+ ret = ublk_set_auto_buf_reg(cmd);
+ if (ret)
+ goto out;
+ }
+
ublk_fill_io_cmd(io, cmd, buf_addr);
ublk_mark_io_ready(ub, ubq);
out:
@@ -2038,6 +2090,90 @@ out:
return ret;
}
+static int ublk_commit_and_fetch(const struct ublk_queue *ubq,
+ struct ublk_io *io, struct io_uring_cmd *cmd,
+ const struct ublksrv_io_cmd *ub_cmd,
+ unsigned int issue_flags)
+{
+ struct request *req = io->req;
+
+ if (ublk_need_map_io(ubq)) {
+ /*
+ * COMMIT_AND_FETCH_REQ has to provide IO buffer if
+ * NEED GET DATA is not enabled or it is Read IO.
+ */
+ if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
+ req_op(req) == REQ_OP_READ))
+ return -EINVAL;
+ } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
+ /*
+ * User copy requires addr to be unset when command is
+ * not zone append
+ */
+ return -EINVAL;
+ }
+
+ if (ublk_support_auto_buf_reg(ubq)) {
+ int ret;
+
+ /*
+ * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
+ * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from same
+ * `io_ring_ctx`.
+ *
+ * If this uring_cmd's io_ring_ctx isn't same with the
+ * one for registering the buffer, it is ublk server's
+ * responsibility for unregistering the buffer, otherwise
+ * this ublk request gets stuck.
+ */
+ if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+ if (data->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
+ io_buffer_unregister_bvec(cmd, data->buf_index,
+ issue_flags);
+ io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
+ }
+
+ ret = ublk_set_auto_buf_reg(cmd);
+ if (ret)
+ return ret;
+ }
+
+ ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
+
+ /* now this cmd slot is owned by ublk driver */
+ io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
+ io->res = ub_cmd->result;
+
+ if (req_op(req) == REQ_OP_ZONE_APPEND)
+ req->__sector = ub_cmd->zone_append_lba;
+
+ if (likely(!blk_should_fake_timeout(req->q)))
+ ublk_put_req_ref(ubq, req);
+
+ return 0;
+}
+
+static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io)
+{
+ struct request *req = io->req;
+
+ /*
+ * We have handled UBLK_IO_NEED_GET_DATA command,
+ * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
+ * do the copy work.
+ */
+ io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
+ /* update iod->addr because ublksrv may have passed a new io buffer */
+ ublk_get_iod(ubq, req->tag)->addr = io->addr;
+ pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
+ __func__, ubq->q_id, req->tag, io->flags,
+ ublk_get_iod(ubq, req->tag)->addr);
+
+ return ublk_start_io(ubq, req, io);
+}
+
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@@ -2048,7 +2184,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
u32 cmd_op = cmd->cmd_op;
unsigned tag = ub_cmd->tag;
int ret = -EINVAL;
- struct request *req;
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
__func__, cmd->cmd_op, ub_cmd->q_id, tag,
@@ -2058,9 +2193,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
goto out;
ubq = ublk_get_queue(ub, ub_cmd->q_id);
- if (!ubq || ub_cmd->q_id != ubq->q_id)
- goto out;
-
if (ubq->ubq_daemon && ubq->ubq_daemon != current)
goto out;
@@ -2075,6 +2207,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
goto out;
}
+ /* only UBLK_IO_FETCH_REQ is allowed if io is not OWNED_BY_SRV */
+ if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) &&
+ _IOC_NR(cmd_op) != UBLK_IO_FETCH_REQ)
+ goto out;
+
/*
* ensure that the user issues UBLK_IO_NEED_GET_DATA
* iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
@@ -2092,45 +2229,23 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_IO_REGISTER_IO_BUF:
return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
case UBLK_IO_UNREGISTER_IO_BUF:
- return ublk_unregister_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
+ return ublk_unregister_io_buf(cmd, ubq, ub_cmd->addr, issue_flags);
case UBLK_IO_FETCH_REQ:
ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
if (ret)
goto out;
break;
case UBLK_IO_COMMIT_AND_FETCH_REQ:
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
-
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- goto out;
-
- if (ublk_need_map_io(ubq)) {
- /*
- * COMMIT_AND_FETCH_REQ has to provide IO buffer if
- * NEED GET DATA is not enabled or it is Read IO.
- */
- if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
- req_op(req) == REQ_OP_READ))
- goto out;
- } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
- /*
- * User copy requires addr to be unset when command is
- * not zone append
- */
- ret = -EINVAL;
+ ret = ublk_commit_and_fetch(ubq, io, cmd, ub_cmd, issue_flags);
+ if (ret)
goto out;
- }
-
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
- ublk_commit_completion(ub, ub_cmd);
break;
case UBLK_IO_NEED_GET_DATA:
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- goto out;
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
- ublk_dispatch_req(ubq, req, issue_flags);
- return -EIOCBQUEUED;
+ io->addr = ub_cmd->addr;
+ if (!ublk_get_data(ubq, io))
+ return -EIOCBQUEUED;
+
+ return UBLK_IO_RES_OK;
default:
goto out;
}
@@ -2728,6 +2843,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
return -EINVAL;
}
+ if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
+ pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
+ return -EINVAL;
+ }
+
/*
* unprivileged device can't be trusted, but RECOVERY and
* RECOVERY_REISSUE still may hang error handling, so can't
@@ -2744,8 +2864,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
* For USER_COPY, we depends on userspace to fill request
* buffer by pwrite() to ublk char device, which can't be
* used for unprivileged device
+ *
+ * Same with zero copy or auto buffer register.
*/
- if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
+ if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
+ UBLK_F_AUTO_BUF_REG))
return -EINVAL;
}
@@ -2803,7 +2926,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
UBLK_F_URING_CMD_COMP_IN_TASK;
/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
- if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
+ if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
+ UBLK_F_AUTO_BUF_REG))
ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
/*
@@ -3106,6 +3230,127 @@ static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
return 0;
}
+static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
+{
+ struct ublk_param_basic *p = &ub->params.basic;
+ u64 new_size = header->data[0];
+
+ mutex_lock(&ub->mutex);
+ p->dev_sectors = new_size;
+ set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
+ mutex_unlock(&ub->mutex);
+}
+
+struct count_busy {
+ const struct ublk_queue *ubq;
+ unsigned int nr_busy;
+};
+
+static bool ublk_count_busy_req(struct request *rq, void *data)
+{
+ struct count_busy *idle = data;
+
+ if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
+ idle->nr_busy += 1;
+ return true;
+}
+
+/* uring_cmd is guaranteed to be active if the associated request is idle */
+static bool ubq_has_idle_io(const struct ublk_queue *ubq)
+{
+ struct count_busy data = {
+ .ubq = ubq,
+ };
+
+ blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
+ return data.nr_busy < ubq->q_depth;
+}
+
+/* Wait until each hw queue has at least one idle IO */
+static int ublk_wait_for_idle_io(struct ublk_device *ub,
+ unsigned int timeout_ms)
+{
+ unsigned int elapsed = 0;
+ int ret;
+
+ while (elapsed < timeout_ms && !signal_pending(current)) {
+ unsigned int queues_cancelable = 0;
+ int i;
+
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+ queues_cancelable += !!ubq_has_idle_io(ubq);
+ }
+
+ /*
+ * Each queue needs at least one active command for
+ * notifying ublk server
+ */
+ if (queues_cancelable == ub->dev_info.nr_hw_queues)
+ break;
+
+ msleep(UBLK_REQUEUE_DELAY_MS);
+ elapsed += UBLK_REQUEUE_DELAY_MS;
+ }
+
+ if (signal_pending(current))
+ ret = -EINTR;
+ else if (elapsed >= timeout_ms)
+ ret = -EBUSY;
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
+ const struct ublksrv_ctrl_cmd *header)
+{
+ /* zero means wait forever */
+ u64 timeout_ms = header->data[0];
+ struct gendisk *disk;
+ int i, ret = -ENODEV;
+
+ if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
+ return -EOPNOTSUPP;
+
+ mutex_lock(&ub->mutex);
+ disk = ublk_get_disk(ub);
+ if (!disk)
+ goto unlock;
+ if (ub->dev_info.state == UBLK_S_DEV_DEAD)
+ goto put_disk;
+
+ ret = 0;
+ /* already in expected state */
+ if (ub->dev_info.state != UBLK_S_DEV_LIVE)
+ goto put_disk;
+
+ /* Mark all queues as canceling */
+ blk_mq_quiesce_queue(disk->queue);
+ for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+ struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+ ubq->canceling = true;
+ }
+ blk_mq_unquiesce_queue(disk->queue);
+
+ if (!timeout_ms)
+ timeout_ms = UINT_MAX;
+ ret = ublk_wait_for_idle_io(ub, timeout_ms);
+
+put_disk:
+ ublk_put_disk(disk);
+unlock:
+ mutex_unlock(&ub->mutex);
+
+ /* Cancel pending uring_cmd */
+ if (!ret)
+ ublk_cancel_dev(ub);
+ return ret;
+}
+
/*
* All control commands are sent via /dev/ublk-control, so we have to check
* the destination device's permission
@@ -3191,6 +3436,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
case UBLK_CMD_SET_PARAMS:
case UBLK_CMD_START_USER_RECOVERY:
case UBLK_CMD_END_USER_RECOVERY:
+ case UBLK_CMD_UPDATE_SIZE:
+ case UBLK_CMD_QUIESCE_DEV:
mask = MAY_READ | MAY_WRITE;
break;
default:
@@ -3282,6 +3529,13 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_CMD_END_USER_RECOVERY:
ret = ublk_ctrl_end_recovery(ub, header);
break;
+ case UBLK_CMD_UPDATE_SIZE:
+ ublk_ctrl_set_size(ub, header);
+ ret = 0;
+ break;
+ case UBLK_CMD_QUIESCE_DEV:
+ ret = ublk_ctrl_quiesce_dev(ub, header);
+ break;
default:
ret = -EOPNOTSUPP;
break;
@@ -3315,6 +3569,7 @@ static int __init ublk_init(void)
BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
+ BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
init_waitqueue_head(&ublk_idr_wq);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 7cffea01d868..30bca8cb7106 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -571,7 +571,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk,
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT);
vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector);
- err = blk_rq_map_kern(q, req, report_buf, report_len, GFP_KERNEL);
+ err = blk_rq_map_kern(req, report_buf, report_len, GFP_KERNEL);
if (err)
goto out;
@@ -817,7 +817,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
vbr->out_hdr.sector = 0;
- err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
+ err = blk_rq_map_kern(req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
if (err)
goto out;
diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
new file mode 100644
index 000000000000..553b1a713ab9
--- /dev/null
+++ b/drivers/block/zloop.c
@@ -0,0 +1,1385 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2025, Christoph Hellwig.
+ * Copyright (c) 2025, Western Digital Corporation or its affiliates.
+ *
+ * Zoned Loop Device driver - exports a zoned block device using one file per
+ * zone as backing storage.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/blk-mq.h>
+#include <linux/blkzoned.h>
+#include <linux/pagemap.h>
+#include <linux/miscdevice.h>
+#include <linux/falloc.h>
+#include <linux/mutex.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+
+/*
+ * Options for adding (and removing) a device.
+ */
+enum {
+ ZLOOP_OPT_ERR = 0,
+ ZLOOP_OPT_ID = (1 << 0),
+ ZLOOP_OPT_CAPACITY = (1 << 1),
+ ZLOOP_OPT_ZONE_SIZE = (1 << 2),
+ ZLOOP_OPT_ZONE_CAPACITY = (1 << 3),
+ ZLOOP_OPT_NR_CONV_ZONES = (1 << 4),
+ ZLOOP_OPT_BASE_DIR = (1 << 5),
+ ZLOOP_OPT_NR_QUEUES = (1 << 6),
+ ZLOOP_OPT_QUEUE_DEPTH = (1 << 7),
+ ZLOOP_OPT_BUFFERED_IO = (1 << 8),
+};
+
+static const match_table_t zloop_opt_tokens = {
+ { ZLOOP_OPT_ID, "id=%d" },
+ { ZLOOP_OPT_CAPACITY, "capacity_mb=%u" },
+ { ZLOOP_OPT_ZONE_SIZE, "zone_size_mb=%u" },
+ { ZLOOP_OPT_ZONE_CAPACITY, "zone_capacity_mb=%u" },
+ { ZLOOP_OPT_NR_CONV_ZONES, "conv_zones=%u" },
+ { ZLOOP_OPT_BASE_DIR, "base_dir=%s" },
+ { ZLOOP_OPT_NR_QUEUES, "nr_queues=%u" },
+ { ZLOOP_OPT_QUEUE_DEPTH, "queue_depth=%u" },
+ { ZLOOP_OPT_BUFFERED_IO, "buffered_io" },
+ { ZLOOP_OPT_ERR, NULL }
+};
+
+/* Default values for the "add" operation. */
+#define ZLOOP_DEF_ID -1
+#define ZLOOP_DEF_ZONE_SIZE ((256ULL * SZ_1M) >> SECTOR_SHIFT)
+#define ZLOOP_DEF_NR_ZONES 64
+#define ZLOOP_DEF_NR_CONV_ZONES 8
+#define ZLOOP_DEF_BASE_DIR "/var/local/zloop"
+#define ZLOOP_DEF_NR_QUEUES 1
+#define ZLOOP_DEF_QUEUE_DEPTH 128
+#define ZLOOP_DEF_BUFFERED_IO false
+
+/* Arbitrary limit on the zone size (16GB). */
+#define ZLOOP_MAX_ZONE_SIZE_MB 16384
+
+struct zloop_options {
+ unsigned int mask;
+ int id;
+ sector_t capacity;
+ sector_t zone_size;
+ sector_t zone_capacity;
+ unsigned int nr_conv_zones;
+ char *base_dir;
+ unsigned int nr_queues;
+ unsigned int queue_depth;
+ bool buffered_io;
+};
+
+/*
+ * Device states.
+ */
+enum {
+ Zlo_creating = 0,
+ Zlo_live,
+ Zlo_deleting,
+};
+
+enum zloop_zone_flags {
+ ZLOOP_ZONE_CONV = 0,
+ ZLOOP_ZONE_SEQ_ERROR,
+};
+
+struct zloop_zone {
+ struct file *file;
+
+ unsigned long flags;
+ struct mutex lock;
+ enum blk_zone_cond cond;
+ sector_t start;
+ sector_t wp;
+
+ gfp_t old_gfp_mask;
+};
+
+struct zloop_device {
+ unsigned int id;
+ unsigned int state;
+
+ struct blk_mq_tag_set tag_set;
+ struct gendisk *disk;
+
+ struct workqueue_struct *workqueue;
+ bool buffered_io;
+
+ const char *base_dir;
+ struct file *data_dir;
+
+ unsigned int zone_shift;
+ sector_t zone_size;
+ sector_t zone_capacity;
+ unsigned int nr_zones;
+ unsigned int nr_conv_zones;
+ unsigned int block_size;
+
+ struct zloop_zone zones[] __counted_by(nr_zones);
+};
+
+struct zloop_cmd {
+ struct work_struct work;
+ atomic_t ref;
+ sector_t sector;
+ sector_t nr_sectors;
+ long ret;
+ struct kiocb iocb;
+ struct bio_vec *bvec;
+};
+
+static DEFINE_IDR(zloop_index_idr);
+static DEFINE_MUTEX(zloop_ctl_mutex);
+
+static unsigned int rq_zone_no(struct request *rq)
+{
+ struct zloop_device *zlo = rq->q->queuedata;
+
+ return blk_rq_pos(rq) >> zlo->zone_shift;
+}
+
+static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ struct kstat stat;
+ sector_t file_sectors;
+ int ret;
+
+ lockdep_assert_held(&zone->lock);
+
+ ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
+ if (ret < 0) {
+ pr_err("Failed to get zone %u file stat (err=%d)\n",
+ zone_no, ret);
+ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ return ret;
+ }
+
+ file_sectors = stat.size >> SECTOR_SHIFT;
+ if (file_sectors > zlo->zone_capacity) {
+ pr_err("Zone %u file too large (%llu sectors > %llu)\n",
+ zone_no, file_sectors, zlo->zone_capacity);
+ return -EINVAL;
+ }
+
+ if (file_sectors & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
+ pr_err("Zone %u file size not aligned to block size %u\n",
+ zone_no, zlo->block_size);
+ return -EINVAL;
+ }
+
+ if (!file_sectors) {
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ zone->wp = zone->start;
+ } else if (file_sectors == zlo->zone_capacity) {
+ zone->cond = BLK_ZONE_COND_FULL;
+ zone->wp = zone->start + zlo->zone_size;
+ } else {
+ zone->cond = BLK_ZONE_COND_CLOSED;
+ zone->wp = zone->start + file_sectors;
+ }
+
+ return 0;
+}
+
+static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int ret = 0;
+
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+ return -EIO;
+
+ mutex_lock(&zone->lock);
+
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ if (ret)
+ goto unlock;
+ }
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_EXP_OPEN:
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_CLOSED:
+ case BLK_ZONE_COND_IMP_OPEN:
+ zone->cond = BLK_ZONE_COND_EXP_OPEN;
+ break;
+ case BLK_ZONE_COND_FULL:
+ default:
+ ret = -EIO;
+ break;
+ }
+
+unlock:
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int ret = 0;
+
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+ return -EIO;
+
+ mutex_lock(&zone->lock);
+
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ if (ret)
+ goto unlock;
+ }
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_CLOSED:
+ break;
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ if (zone->wp == zone->start)
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ else
+ zone->cond = BLK_ZONE_COND_CLOSED;
+ break;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_FULL:
+ default:
+ ret = -EIO;
+ break;
+ }
+
+unlock:
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int ret = 0;
+
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+ return -EIO;
+
+ mutex_lock(&zone->lock);
+
+ if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
+ zone->cond == BLK_ZONE_COND_EMPTY)
+ goto unlock;
+
+ if (vfs_truncate(&zone->file->f_path, 0)) {
+ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ ret = -EIO;
+ goto unlock;
+ }
+
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ zone->wp = zone->start;
+ clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+
+unlock:
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static int zloop_reset_all_zones(struct zloop_device *zlo)
+{
+ unsigned int i;
+ int ret;
+
+ for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) {
+ ret = zloop_reset_zone(zlo, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int ret = 0;
+
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+ return -EIO;
+
+ mutex_lock(&zone->lock);
+
+ if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
+ zone->cond == BLK_ZONE_COND_FULL)
+ goto unlock;
+
+ if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) {
+ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ ret = -EIO;
+ goto unlock;
+ }
+
+ zone->cond = BLK_ZONE_COND_FULL;
+ zone->wp = zone->start + zlo->zone_size;
+ clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+
+ unlock:
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static void zloop_put_cmd(struct zloop_cmd *cmd)
+{
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
+
+ if (!atomic_dec_and_test(&cmd->ref))
+ return;
+ kfree(cmd->bvec);
+ cmd->bvec = NULL;
+ if (likely(!blk_should_fake_timeout(rq->q)))
+ blk_mq_complete_request(rq);
+}
+
+static void zloop_rw_complete(struct kiocb *iocb, long ret)
+{
+ struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb);
+
+ cmd->ret = ret;
+ zloop_put_cmd(cmd);
+}
+
+static void zloop_rw(struct zloop_cmd *cmd)
+{
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
+ struct zloop_device *zlo = rq->q->queuedata;
+ unsigned int zone_no = rq_zone_no(rq);
+ sector_t sector = blk_rq_pos(rq);
+ sector_t nr_sectors = blk_rq_sectors(rq);
+ bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
+ bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
+ int rw = is_write ? ITER_SOURCE : ITER_DEST;
+ struct req_iterator rq_iter;
+ struct zloop_zone *zone;
+ struct iov_iter iter;
+ struct bio_vec tmp;
+ sector_t zone_end;
+ int nr_bvec = 0;
+ int ret;
+
+ atomic_set(&cmd->ref, 2);
+ cmd->sector = sector;
+ cmd->nr_sectors = nr_sectors;
+ cmd->ret = 0;
+
+ /* We should never get an I/O beyond the device capacity. */
+ if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
+ ret = -EIO;
+ goto out;
+ }
+ zone = &zlo->zones[zone_no];
+ zone_end = zone->start + zlo->zone_capacity;
+
+ /*
+ * The block layer should never send requests that are not fully
+ * contained within the zone.
+ */
+ if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ mutex_lock(&zone->lock);
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ mutex_unlock(&zone->lock);
+ if (ret)
+ goto out;
+ }
+
+ if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
+ mutex_lock(&zone->lock);
+
+ if (is_append) {
+ sector = zone->wp;
+ cmd->sector = sector;
+ }
+
+ /*
+ * Write operations must be aligned to the write pointer and
+ * fully contained within the zone capacity.
+ */
+ if (sector != zone->wp || zone->wp + nr_sectors > zone_end) {
+ pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
+ zone_no, sector, zone->wp);
+ ret = -EIO;
+ goto unlock;
+ }
+
+ /* Implicitly open the target zone. */
+ if (zone->cond == BLK_ZONE_COND_CLOSED ||
+ zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+ /*
+ * Advance the write pointer of sequential zones. If the write
+ * fails, the wp position will be corrected when the next I/O
+ * copmpletes.
+ */
+ zone->wp += nr_sectors;
+ if (zone->wp == zone_end)
+ zone->cond = BLK_ZONE_COND_FULL;
+ }
+
+ rq_for_each_bvec(tmp, rq, rq_iter)
+ nr_bvec++;
+
+ if (rq->bio != rq->biotail) {
+ struct bio_vec *bvec;
+
+ cmd->bvec = kmalloc_array(nr_bvec, sizeof(*cmd->bvec), GFP_NOIO);
+ if (!cmd->bvec) {
+ ret = -EIO;
+ goto unlock;
+ }
+
+ /*
+ * The bios of the request may be started from the middle of
+ * the 'bvec' because of bio splitting, so we can't directly
+ * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
+ * API will take care of all details for us.
+ */
+ bvec = cmd->bvec;
+ rq_for_each_bvec(tmp, rq, rq_iter) {
+ *bvec = tmp;
+ bvec++;
+ }
+ iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
+ } else {
+ /*
+ * Same here, this bio may be started from the middle of the
+ * 'bvec' because of bio splitting, so offset from the bvec
+ * must be passed to iov iterator
+ */
+ iov_iter_bvec(&iter, rw,
+ __bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter),
+ nr_bvec, blk_rq_bytes(rq));
+ iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
+ }
+
+ cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT;
+ cmd->iocb.ki_filp = zone->file;
+ cmd->iocb.ki_complete = zloop_rw_complete;
+ if (!zlo->buffered_io)
+ cmd->iocb.ki_flags = IOCB_DIRECT;
+ cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+
+ if (rw == ITER_SOURCE)
+ ret = zone->file->f_op->write_iter(&cmd->iocb, &iter);
+ else
+ ret = zone->file->f_op->read_iter(&cmd->iocb, &iter);
+unlock:
+ if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write)
+ mutex_unlock(&zone->lock);
+out:
+ if (ret != -EIOCBQUEUED)
+ zloop_rw_complete(&cmd->iocb, ret);
+ zloop_put_cmd(cmd);
+}
+
+static void zloop_handle_cmd(struct zloop_cmd *cmd)
+{
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
+ struct zloop_device *zlo = rq->q->queuedata;
+
+ switch (req_op(rq)) {
+ case REQ_OP_READ:
+ case REQ_OP_WRITE:
+ case REQ_OP_ZONE_APPEND:
+ /*
+ * zloop_rw() always executes asynchronously or completes
+ * directly.
+ */
+ zloop_rw(cmd);
+ return;
+ case REQ_OP_FLUSH:
+ /*
+ * Sync the entire FS containing the zone files instead of
+ * walking all files
+ */
+ cmd->ret = sync_filesystem(file_inode(zlo->data_dir)->i_sb);
+ break;
+ case REQ_OP_ZONE_RESET:
+ cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq));
+ break;
+ case REQ_OP_ZONE_RESET_ALL:
+ cmd->ret = zloop_reset_all_zones(zlo);
+ break;
+ case REQ_OP_ZONE_FINISH:
+ cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq));
+ break;
+ case REQ_OP_ZONE_OPEN:
+ cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq));
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq));
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ pr_err("Unsupported operation %d\n", req_op(rq));
+ cmd->ret = -EOPNOTSUPP;
+ break;
+ }
+
+ blk_mq_complete_request(rq);
+}
+
+static void zloop_cmd_workfn(struct work_struct *work)
+{
+ struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work);
+ int orig_flags = current->flags;
+
+ current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+ zloop_handle_cmd(cmd);
+ current->flags = orig_flags;
+}
+
+static void zloop_complete_rq(struct request *rq)
+{
+ struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ struct zloop_device *zlo = rq->q->queuedata;
+ unsigned int zone_no = cmd->sector >> zlo->zone_shift;
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ blk_status_t sts = BLK_STS_OK;
+
+ switch (req_op(rq)) {
+ case REQ_OP_READ:
+ if (cmd->ret < 0)
+ pr_err("Zone %u: failed read sector %llu, %llu sectors\n",
+ zone_no, cmd->sector, cmd->nr_sectors);
+
+ if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
+ /* short read */
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq)
+ zero_fill_bio(bio);
+ }
+ break;
+ case REQ_OP_WRITE:
+ case REQ_OP_ZONE_APPEND:
+ if (cmd->ret < 0)
+ pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n",
+ zone_no,
+ req_op(rq) == REQ_OP_WRITE ? "" : "append ",
+ cmd->sector, cmd->nr_sectors);
+
+ if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
+ pr_err("Zone %u: partial write %ld/%u B\n",
+ zone_no, cmd->ret, blk_rq_bytes(rq));
+ cmd->ret = -EIO;
+ }
+
+ if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
+ /*
+ * A write to a sequential zone file failed: mark the
+ * zone as having an error. This will be corrected and
+ * cleared when the next IO is submitted.
+ */
+ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ break;
+ }
+ if (req_op(rq) == REQ_OP_ZONE_APPEND)
+ rq->__sector = cmd->sector;
+
+ break;
+ default:
+ break;
+ }
+
+ if (cmd->ret < 0)
+ sts = errno_to_blk_status(cmd->ret);
+ blk_mq_end_request(rq, sts);
+}
+
+static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct request *rq = bd->rq;
+ struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ struct zloop_device *zlo = rq->q->queuedata;
+
+ if (zlo->state == Zlo_deleting)
+ return BLK_STS_IOERR;
+
+ blk_mq_start_request(rq);
+
+ INIT_WORK(&cmd->work, zloop_cmd_workfn);
+ queue_work(zlo->workqueue, &cmd->work);
+
+ return BLK_STS_OK;
+}
+
+static const struct blk_mq_ops zloop_mq_ops = {
+ .queue_rq = zloop_queue_rq,
+ .complete = zloop_complete_rq,
+};
+
+static int zloop_open(struct gendisk *disk, blk_mode_t mode)
+{
+ struct zloop_device *zlo = disk->private_data;
+ int ret;
+
+ ret = mutex_lock_killable(&zloop_ctl_mutex);
+ if (ret)
+ return ret;
+
+ if (zlo->state != Zlo_live)
+ ret = -ENXIO;
+ mutex_unlock(&zloop_ctl_mutex);
+ return ret;
+}
+
+static int zloop_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+ struct zloop_device *zlo = disk->private_data;
+ struct blk_zone blkz = {};
+ unsigned int first, i;
+ int ret;
+
+ first = disk_zone_no(disk, sector);
+ if (first >= zlo->nr_zones)
+ return 0;
+ nr_zones = min(nr_zones, zlo->nr_zones - first);
+
+ for (i = 0; i < nr_zones; i++) {
+ unsigned int zone_no = first + i;
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+
+ mutex_lock(&zone->lock);
+
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ if (ret) {
+ mutex_unlock(&zone->lock);
+ return ret;
+ }
+ }
+
+ blkz.start = zone->start;
+ blkz.len = zlo->zone_size;
+ blkz.wp = zone->wp;
+ blkz.cond = zone->cond;
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
+ blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
+ blkz.capacity = zlo->zone_size;
+ } else {
+ blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+ blkz.capacity = zlo->zone_capacity;
+ }
+
+ mutex_unlock(&zone->lock);
+
+ ret = cb(&blkz, i, data);
+ if (ret)
+ return ret;
+ }
+
+ return nr_zones;
+}
+
+static void zloop_free_disk(struct gendisk *disk)
+{
+ struct zloop_device *zlo = disk->private_data;
+ unsigned int i;
+
+ for (i = 0; i < zlo->nr_zones; i++) {
+ struct zloop_zone *zone = &zlo->zones[i];
+
+ mapping_set_gfp_mask(zone->file->f_mapping,
+ zone->old_gfp_mask);
+ fput(zone->file);
+ }
+
+ fput(zlo->data_dir);
+ destroy_workqueue(zlo->workqueue);
+ kfree(zlo->base_dir);
+ kvfree(zlo);
+}
+
+static const struct block_device_operations zloop_fops = {
+ .owner = THIS_MODULE,
+ .open = zloop_open,
+ .report_zones = zloop_report_zones,
+ .free_disk = zloop_free_disk,
+};
+
+__printf(3, 4)
+static struct file *zloop_filp_open_fmt(int oflags, umode_t mode,
+ const char *fmt, ...)
+{
+ struct file *file;
+ va_list ap;
+ char *p;
+
+ va_start(ap, fmt);
+ p = kvasprintf(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+ file = filp_open(p, oflags, mode);
+ kfree(p);
+ return file;
+}
+
+static int zloop_get_block_size(struct zloop_device *zlo,
+ struct zloop_zone *zone)
+{
+ struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev;
+ struct kstat st;
+
+ /*
+ * If the FS block size is lower than or equal to 4K, use that as the
+ * device block size. Otherwise, fallback to the FS direct IO alignment
+ * constraint if that is provided, and to the FS underlying device
+ * physical block size if the direct IO alignment is unknown.
+ */
+ if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
+ zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
+ else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
+ (st.result_mask & STATX_DIOALIGN))
+ zlo->block_size = st.dio_offset_align;
+ else if (sb_bdev)
+ zlo->block_size = bdev_physical_block_size(sb_bdev);
+ else
+ zlo->block_size = SECTOR_SIZE;
+
+ if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
+ pr_err("Zone capacity is not aligned to block size %u\n",
+ zlo->block_size);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
+ unsigned int zone_no, bool restore)
+{
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ int oflags = O_RDWR;
+ struct kstat stat;
+ sector_t file_sectors;
+ int ret;
+
+ mutex_init(&zone->lock);
+ zone->start = (sector_t)zone_no << zlo->zone_shift;
+
+ if (!restore)
+ oflags |= O_CREAT;
+
+ if (!opts->buffered_io)
+ oflags |= O_DIRECT;
+
+ if (zone_no < zlo->nr_conv_zones) {
+ /* Conventional zone file. */
+ set_bit(ZLOOP_ZONE_CONV, &zone->flags);
+ zone->cond = BLK_ZONE_COND_NOT_WP;
+ zone->wp = U64_MAX;
+
+ zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u",
+ zlo->base_dir, zlo->id, zone_no);
+ if (IS_ERR(zone->file)) {
+ pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)",
+ zone_no, zlo->base_dir, zlo->id, zone_no,
+ PTR_ERR(zone->file));
+ return PTR_ERR(zone->file);
+ }
+
+ if (!zlo->block_size) {
+ ret = zloop_get_block_size(zlo, zone);
+ if (ret)
+ return ret;
+ }
+
+ ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
+ if (ret < 0) {
+ pr_err("Failed to get zone %u file stat\n", zone_no);
+ return ret;
+ }
+ file_sectors = stat.size >> SECTOR_SHIFT;
+
+ if (restore && file_sectors != zlo->zone_size) {
+ pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n",
+ zone_no, file_sectors, zlo->zone_capacity);
+ return ret;
+ }
+
+ ret = vfs_truncate(&zone->file->f_path,
+ zlo->zone_size << SECTOR_SHIFT);
+ if (ret < 0) {
+ pr_err("Failed to truncate zone %u file (err=%d)\n",
+ zone_no, ret);
+ return ret;
+ }
+
+ return 0;
+ }
+
+ /* Sequential zone file. */
+ zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u",
+ zlo->base_dir, zlo->id, zone_no);
+ if (IS_ERR(zone->file)) {
+ pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)",
+ zone_no, zlo->base_dir, zlo->id, zone_no,
+ PTR_ERR(zone->file));
+ return PTR_ERR(zone->file);
+ }
+
+ if (!zlo->block_size) {
+ ret = zloop_get_block_size(zlo, zone);
+ if (ret)
+ return ret;
+ }
+
+ zloop_get_block_size(zlo, zone);
+
+ mutex_lock(&zone->lock);
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ mutex_unlock(&zone->lock);
+
+ return ret;
+}
+
+static bool zloop_dev_exists(struct zloop_device *zlo)
+{
+ struct file *cnv, *seq;
+ bool exists;
+
+ cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u",
+ zlo->base_dir, zlo->id, 0);
+ seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u",
+ zlo->base_dir, zlo->id, 0);
+ exists = !IS_ERR(cnv) || !IS_ERR(seq);
+
+ if (!IS_ERR(cnv))
+ fput(cnv);
+ if (!IS_ERR(seq))
+ fput(seq);
+
+ return exists;
+}
+
+static int zloop_ctl_add(struct zloop_options *opts)
+{
+ struct queue_limits lim = {
+ .max_hw_sectors = SZ_1M >> SECTOR_SHIFT,
+ .max_hw_zone_append_sectors = SZ_1M >> SECTOR_SHIFT,
+ .chunk_sectors = opts->zone_size,
+ .features = BLK_FEAT_ZONED,
+ };
+ unsigned int nr_zones, i, j;
+ struct zloop_device *zlo;
+ int ret = -EINVAL;
+ bool restore;
+
+ __module_get(THIS_MODULE);
+
+ nr_zones = opts->capacity >> ilog2(opts->zone_size);
+ if (opts->nr_conv_zones >= nr_zones) {
+ pr_err("Invalid number of conventional zones %u\n",
+ opts->nr_conv_zones);
+ goto out;
+ }
+
+ zlo = kvzalloc(struct_size(zlo, zones, nr_zones), GFP_KERNEL);
+ if (!zlo) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ zlo->state = Zlo_creating;
+
+ ret = mutex_lock_killable(&zloop_ctl_mutex);
+ if (ret)
+ goto out_free_dev;
+
+ /* Allocate id, if @opts->id >= 0, we're requesting that specific id */
+ if (opts->id >= 0) {
+ ret = idr_alloc(&zloop_index_idr, zlo,
+ opts->id, opts->id + 1, GFP_KERNEL);
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+ } else {
+ ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL);
+ }
+ mutex_unlock(&zloop_ctl_mutex);
+ if (ret < 0)
+ goto out_free_dev;
+
+ zlo->id = ret;
+ zlo->zone_shift = ilog2(opts->zone_size);
+ zlo->zone_size = opts->zone_size;
+ if (opts->zone_capacity)
+ zlo->zone_capacity = opts->zone_capacity;
+ else
+ zlo->zone_capacity = zlo->zone_size;
+ zlo->nr_zones = nr_zones;
+ zlo->nr_conv_zones = opts->nr_conv_zones;
+ zlo->buffered_io = opts->buffered_io;
+
+ zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
+ opts->nr_queues * opts->queue_depth, zlo->id);
+ if (!zlo->workqueue) {
+ ret = -ENOMEM;
+ goto out_free_idr;
+ }
+
+ if (opts->base_dir)
+ zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL);
+ else
+ zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL);
+ if (!zlo->base_dir) {
+ ret = -ENOMEM;
+ goto out_destroy_workqueue;
+ }
+
+ zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u",
+ zlo->base_dir, zlo->id);
+ if (IS_ERR(zlo->data_dir)) {
+ ret = PTR_ERR(zlo->data_dir);
+ pr_warn("Failed to open directory %s/%u (err=%d)\n",
+ zlo->base_dir, zlo->id, ret);
+ goto out_free_base_dir;
+ }
+
+ /*
+ * If we already have zone files, we are restoring a device created by a
+ * previous add operation. In this case, zloop_init_zone() will check
+ * that the zone files are consistent with the zone configuration given.
+ */
+ restore = zloop_dev_exists(zlo);
+ for (i = 0; i < nr_zones; i++) {
+ ret = zloop_init_zone(zlo, opts, i, restore);
+ if (ret)
+ goto out_close_files;
+ }
+
+ lim.physical_block_size = zlo->block_size;
+ lim.logical_block_size = zlo->block_size;
+
+ zlo->tag_set.ops = &zloop_mq_ops;
+ zlo->tag_set.nr_hw_queues = opts->nr_queues;
+ zlo->tag_set.queue_depth = opts->queue_depth;
+ zlo->tag_set.numa_node = NUMA_NO_NODE;
+ zlo->tag_set.cmd_size = sizeof(struct zloop_cmd);
+ zlo->tag_set.driver_data = zlo;
+
+ ret = blk_mq_alloc_tag_set(&zlo->tag_set);
+ if (ret) {
+ pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret);
+ goto out_close_files;
+ }
+
+ zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo);
+ if (IS_ERR(zlo->disk)) {
+ pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret);
+ ret = PTR_ERR(zlo->disk);
+ goto out_cleanup_tags;
+ }
+ zlo->disk->flags = GENHD_FL_NO_PART;
+ zlo->disk->fops = &zloop_fops;
+ zlo->disk->private_data = zlo;
+ sprintf(zlo->disk->disk_name, "zloop%d", zlo->id);
+ set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones);
+
+ ret = blk_revalidate_disk_zones(zlo->disk);
+ if (ret)
+ goto out_cleanup_disk;
+
+ ret = add_disk(zlo->disk);
+ if (ret) {
+ pr_err("add_disk failed (err=%d)\n", ret);
+ goto out_cleanup_disk;
+ }
+
+ mutex_lock(&zloop_ctl_mutex);
+ zlo->state = Zlo_live;
+ mutex_unlock(&zloop_ctl_mutex);
+
+ pr_info("Added device %d: %u zones of %llu MB, %u B block size\n",
+ zlo->id, zlo->nr_zones,
+ ((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
+ zlo->block_size);
+
+ return 0;
+
+out_cleanup_disk:
+ put_disk(zlo->disk);
+out_cleanup_tags:
+ blk_mq_free_tag_set(&zlo->tag_set);
+out_close_files:
+ for (j = 0; j < i; j++) {
+ struct zloop_zone *zone = &zlo->zones[j];
+
+ if (!IS_ERR_OR_NULL(zone->file))
+ fput(zone->file);
+ }
+ fput(zlo->data_dir);
+out_free_base_dir:
+ kfree(zlo->base_dir);
+out_destroy_workqueue:
+ destroy_workqueue(zlo->workqueue);
+out_free_idr:
+ mutex_lock(&zloop_ctl_mutex);
+ idr_remove(&zloop_index_idr, zlo->id);
+ mutex_unlock(&zloop_ctl_mutex);
+out_free_dev:
+ kvfree(zlo);
+out:
+ module_put(THIS_MODULE);
+ if (ret == -ENOENT)
+ ret = -EINVAL;
+ return ret;
+}
+
+static int zloop_ctl_remove(struct zloop_options *opts)
+{
+ struct zloop_device *zlo;
+ int ret;
+
+ if (!(opts->mask & ZLOOP_OPT_ID)) {
+ pr_err("No ID specified\n");
+ return -EINVAL;
+ }
+
+ ret = mutex_lock_killable(&zloop_ctl_mutex);
+ if (ret)
+ return ret;
+
+ zlo = idr_find(&zloop_index_idr, opts->id);
+ if (!zlo || zlo->state == Zlo_creating) {
+ ret = -ENODEV;
+ } else if (zlo->state == Zlo_deleting) {
+ ret = -EINVAL;
+ } else {
+ idr_remove(&zloop_index_idr, zlo->id);
+ zlo->state = Zlo_deleting;
+ }
+
+ mutex_unlock(&zloop_ctl_mutex);
+ if (ret)
+ return ret;
+
+ del_gendisk(zlo->disk);
+ put_disk(zlo->disk);
+ blk_mq_free_tag_set(&zlo->tag_set);
+
+ pr_info("Removed device %d\n", opts->id);
+
+ module_put(THIS_MODULE);
+
+ return 0;
+}
+
+static int zloop_parse_options(struct zloop_options *opts, const char *buf)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *options, *o, *p;
+ unsigned int token;
+ int ret = 0;
+
+ /* Set defaults. */
+ opts->mask = 0;
+ opts->id = ZLOOP_DEF_ID;
+ opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES;
+ opts->zone_size = ZLOOP_DEF_ZONE_SIZE;
+ opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES;
+ opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
+ opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
+ opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
+
+ if (!buf)
+ return 0;
+
+ /* Skip leading spaces before the options. */
+ while (isspace(*buf))
+ buf++;
+
+ options = o = kstrdup(buf, GFP_KERNEL);
+ if (!options)
+ return -ENOMEM;
+
+ /* Parse the options, doing only some light invalid value checks. */
+ while ((p = strsep(&o, ",\n")) != NULL) {
+ if (!*p)
+ continue;
+
+ token = match_token(p, zloop_opt_tokens, args);
+ opts->mask |= token;
+ switch (token) {
+ case ZLOOP_OPT_ID:
+ if (match_int(args, &opts->id)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case ZLOOP_OPT_CAPACITY:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token) {
+ pr_err("Invalid capacity\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->capacity =
+ ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
+ break;
+ case ZLOOP_OPT_ZONE_SIZE:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB ||
+ !is_power_of_2(token)) {
+ pr_err("Invalid zone size %u\n", token);
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->zone_size =
+ ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
+ break;
+ case ZLOOP_OPT_ZONE_CAPACITY:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token) {
+ pr_err("Invalid zone capacity\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->zone_capacity =
+ ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
+ break;
+ case ZLOOP_OPT_NR_CONV_ZONES:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->nr_conv_zones = token;
+ break;
+ case ZLOOP_OPT_BASE_DIR:
+ p = match_strdup(args);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ kfree(opts->base_dir);
+ opts->base_dir = p;
+ break;
+ case ZLOOP_OPT_NR_QUEUES:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token) {
+ pr_err("Invalid number of queues\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->nr_queues = min(token, num_online_cpus());
+ break;
+ case ZLOOP_OPT_QUEUE_DEPTH:
+ if (match_uint(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!token) {
+ pr_err("Invalid queue depth\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->queue_depth = token;
+ break;
+ case ZLOOP_OPT_BUFFERED_IO:
+ opts->buffered_io = true;
+ break;
+ case ZLOOP_OPT_ERR:
+ default:
+ pr_warn("unknown parameter or missing value '%s'\n", p);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ ret = -EINVAL;
+ if (opts->capacity <= opts->zone_size) {
+ pr_err("Invalid capacity\n");
+ goto out;
+ }
+
+ if (opts->zone_capacity > opts->zone_size) {
+ pr_err("Invalid zone capacity\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ kfree(options);
+ return ret;
+}
+
+enum {
+ ZLOOP_CTL_ADD,
+ ZLOOP_CTL_REMOVE,
+};
+
+static struct zloop_ctl_op {
+ int code;
+ const char *name;
+} zloop_ctl_ops[] = {
+ { ZLOOP_CTL_ADD, "add" },
+ { ZLOOP_CTL_REMOVE, "remove" },
+ { -1, NULL },
+};
+
+static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *pos)
+{
+ struct zloop_options opts = { };
+ struct zloop_ctl_op *op;
+ const char *buf, *opts_buf;
+ int i, ret;
+
+ if (count > PAGE_SIZE)
+ return -ENOMEM;
+
+ buf = memdup_user_nul(ubuf, count);
+ if (IS_ERR(buf))
+ return PTR_ERR(buf);
+
+ for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) {
+ op = &zloop_ctl_ops[i];
+ if (!op->name) {
+ pr_err("Invalid operation\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!strncmp(buf, op->name, strlen(op->name)))
+ break;
+ }
+
+ if (count <= strlen(op->name))
+ opts_buf = NULL;
+ else
+ opts_buf = buf + strlen(op->name);
+
+ ret = zloop_parse_options(&opts, opts_buf);
+ if (ret) {
+ pr_err("Failed to parse options\n");
+ goto out;
+ }
+
+ switch (op->code) {
+ case ZLOOP_CTL_ADD:
+ ret = zloop_ctl_add(&opts);
+ break;
+ case ZLOOP_CTL_REMOVE:
+ ret = zloop_ctl_remove(&opts);
+ break;
+ default:
+ pr_err("Invalid operation\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+out:
+ kfree(opts.base_dir);
+ kfree(buf);
+ return ret ? ret : count;
+}
+
+static int zloop_ctl_show(struct seq_file *seq_file, void *private)
+{
+ const struct match_token *tok;
+ int i;
+
+ /* Add operation */
+ seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name);
+ for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) {
+ tok = &zloop_opt_tokens[i];
+ if (!tok->pattern)
+ break;
+ if (i)
+ seq_putc(seq_file, ',');
+ seq_puts(seq_file, tok->pattern);
+ }
+ seq_putc(seq_file, '\n');
+
+ /* Remove operation */
+ seq_puts(seq_file, zloop_ctl_ops[1].name);
+ seq_puts(seq_file, " id=%d\n");
+
+ return 0;
+}
+
+static int zloop_ctl_open(struct inode *inode, struct file *file)
+{
+ file->private_data = NULL;
+ return single_open(file, zloop_ctl_show, NULL);
+}
+
+static int zloop_ctl_release(struct inode *inode, struct file *file)
+{
+ return single_release(inode, file);
+}
+
+static const struct file_operations zloop_ctl_fops = {
+ .owner = THIS_MODULE,
+ .open = zloop_ctl_open,
+ .release = zloop_ctl_release,
+ .write = zloop_ctl_write,
+ .read = seq_read,
+};
+
+static struct miscdevice zloop_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "zloop-control",
+ .fops = &zloop_ctl_fops,
+};
+
+static int __init zloop_init(void)
+{
+ int ret;
+
+ ret = misc_register(&zloop_misc);
+ if (ret) {
+ pr_err("Failed to register misc device: %d\n", ret);
+ return ret;
+ }
+ pr_info("Module loaded\n");
+
+ return 0;
+}
+
+static void __exit zloop_exit(void)
+{
+ misc_deregister(&zloop_misc);
+ idr_destroy(&zloop_index_idr);
+}
+
+module_init(zloop_init);
+module_exit(zloop_exit);
+
+MODULE_DESCRIPTION("Zoned loopback device");
+MODULE_LICENSE("GPL");
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index b163e043c687..21a10552da61 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -3677,8 +3677,7 @@ static void cdrom_sysctl_register(void)
static void cdrom_sysctl_unregister(void)
{
- if (cdrom_sysctl_header)
- unregister_sysctl_table(cdrom_sysctl_header);
+ unregister_sysctl_table(cdrom_sysctl_header);
}
#else /* CONFIG_SYSCTL */
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 813b38aec3e4..c40db9c161c1 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -293,8 +293,7 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
bio->bi_iter.bi_sector = SB_SECTOR;
- __bio_add_page(bio, virt_to_page(out), SB_SIZE,
- offset_in_page(out));
+ bio_add_virt_nofail(bio, out, SB_SIZE);
out->offset = cpu_to_le64(sb->offset);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index f0b5a6931161..d098e75e3461 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1364,7 +1364,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
ptr = (char *)b->data + offset;
len = n_sectors << SECTOR_SHIFT;
- __bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
+ bio_add_virt_nofail(bio, ptr, len);
submit_bio(bio);
}
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index cc3d3897ef42..1f626066e8cc 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -2557,14 +2557,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
char *mem;
outgoing_bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recheck_bios);
-
- r = bio_add_page(outgoing_bio, virt_to_page(outgoing_data), ic->sectors_per_block << SECTOR_SHIFT, 0);
- if (unlikely(r != (ic->sectors_per_block << SECTOR_SHIFT))) {
- bio_put(outgoing_bio);
- bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(bio);
- return;
- }
+ bio_add_virt_nofail(outgoing_bio, outgoing_data,
+ ic->sectors_per_block << SECTOR_SHIFT);
bip = bio_integrity_alloc(outgoing_bio, GFP_NOIO, 1);
if (IS_ERR(bip)) {
@@ -3211,7 +3205,8 @@ next_chunk:
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recalc_bios);
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
- __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+ bio_add_virt_nofail(bio, recalc_buffer,
+ range.n_sectors << SECTOR_SHIFT);
r = submit_bio_wait(bio);
bio_put(bio);
if (unlikely(r)) {
@@ -3228,7 +3223,8 @@ next_chunk:
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_WRITE, GFP_NOIO, &ic->recalc_bios);
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
- __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+ bio_add_virt_nofail(bio, recalc_buffer,
+ range.n_sectors << SECTOR_SHIFT);
bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
if (unlikely(IS_ERR(bip))) {
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6adc55fd90d3..127138c61be5 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -14,6 +14,7 @@
#include "raid5.h"
#include "raid10.h"
#include "md-bitmap.h"
+#include "dm-core.h"
#include <linux/device-mapper.h>
@@ -3308,6 +3309,7 @@ size_check:
/* Disable/enable discard support on raid set. */
configure_discard_support(rs);
+ rs->md.dm_gendisk = ti->table->md->disk;
mddev_unlock(&rs->md);
return 0;
@@ -3327,6 +3329,7 @@ static void raid_dtr(struct dm_target *ti)
mddev_lock_nointr(&rs->md);
md_stop(&rs->md);
+ rs->md.dm_gendisk = NULL;
mddev_unlock(&rs->md);
if (work_pending(&rs->md.event_work))
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9daa78c5fe33..0fde115e921f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -111,32 +111,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
- * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
- * is 1000 KB/sec, so the extra system load does not show up that much.
- * Increase it if you want to have more _guaranteed_ speed. Note that
- * the RAID driver will use the maximum available bandwidth if the IO
- * subsystem is idle. There is also an 'absolute maximum' reconstruction
- * speed limit - in case reconstruction slows down your system despite
- * idle IO detection.
+ * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
+ * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
+ * does not show up that much. Increase it if you want to have more guaranteed
+ * speed. Note that the RAID driver will use the maximum bandwidth
+ * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
*
- * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
- * or /sys/block/mdX/md/sync_speed_{min,max}
+ * Background sync IO speed control:
+ *
+ * - below speed min:
+ * no limit;
+ * - above speed min and below speed max:
+ * a) if mddev is idle, then no limit;
+ * b) if mddev is busy handling normal IO, then limit inflight sync IO
+ * to sync_io_depth;
+ * - above speed max:
+ * sync IO can't be issued;
+ *
+ * Following configurations can be changed via /proc/sys/dev/raid/ for system
+ * or /sys/block/mdX/md/ for one array.
*/
-
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
-static inline int speed_min(struct mddev *mddev)
+static int sysctl_sync_io_depth = 32;
+
+static int speed_min(struct mddev *mddev)
{
return mddev->sync_speed_min ?
mddev->sync_speed_min : sysctl_speed_limit_min;
}
-static inline int speed_max(struct mddev *mddev)
+static int speed_max(struct mddev *mddev)
{
return mddev->sync_speed_max ?
mddev->sync_speed_max : sysctl_speed_limit_max;
}
+static int sync_io_depth(struct mddev *mddev)
+{
+ return mddev->sync_io_depth ?
+ mddev->sync_io_depth : sysctl_sync_io_depth;
+}
+
static void rdev_uninit_serial(struct md_rdev *rdev)
{
if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
@@ -293,14 +309,21 @@ static const struct ctl_table raid_table[] = {
.procname = "speed_limit_min",
.data = &sysctl_speed_limit_min,
.maxlen = sizeof(int),
- .mode = S_IRUGO|S_IWUSR,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "speed_limit_max",
.data = &sysctl_speed_limit_max,
.maxlen = sizeof(int),
- .mode = S_IRUGO|S_IWUSR,
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_io_depth",
+ .data = &sysctl_sync_io_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
};
@@ -5091,7 +5114,7 @@ static ssize_t
sync_min_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d (%s)\n", speed_min(mddev),
- mddev->sync_speed_min ? "local": "system");
+ mddev->sync_speed_min ? "local" : "system");
}
static ssize_t
@@ -5100,7 +5123,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len)
unsigned int min;
int rv;
- if (strncmp(buf, "system", 6)==0) {
+ if (strncmp(buf, "system", 6) == 0) {
min = 0;
} else {
rv = kstrtouint(buf, 10, &min);
@@ -5120,7 +5143,7 @@ static ssize_t
sync_max_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d (%s)\n", speed_max(mddev),
- mddev->sync_speed_max ? "local": "system");
+ mddev->sync_speed_max ? "local" : "system");
}
static ssize_t
@@ -5129,7 +5152,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
unsigned int max;
int rv;
- if (strncmp(buf, "system", 6)==0) {
+ if (strncmp(buf, "system", 6) == 0) {
max = 0;
} else {
rv = kstrtouint(buf, 10, &max);
@@ -5146,6 +5169,35 @@ static struct md_sysfs_entry md_sync_max =
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
static ssize_t
+sync_io_depth_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
+ mddev->sync_io_depth ? "local" : "system");
+}
+
+static ssize_t
+sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned int max;
+ int rv;
+
+ if (strncmp(buf, "system", 6) == 0) {
+ max = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &max);
+ if (rv < 0)
+ return rv;
+ if (max == 0)
+ return -EINVAL;
+ }
+ mddev->sync_io_depth = max;
+ return len;
+}
+
+static struct md_sysfs_entry md_sync_io_depth =
+__ATTR_RW(sync_io_depth);
+
+static ssize_t
degraded_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d\n", mddev->degraded);
@@ -5671,6 +5723,7 @@ static struct attribute *md_redundancy_attrs[] = {
&md_mismatches.attr,
&md_sync_min.attr,
&md_sync_max.attr,
+ &md_sync_io_depth.attr,
&md_sync_speed.attr,
&md_sync_force_parallel.attr,
&md_sync_completed.attr,
@@ -8572,50 +8625,55 @@ void md_cluster_stop(struct mddev *mddev)
put_cluster_ops(mddev);
}
-static int is_mddev_idle(struct mddev *mddev, int init)
+static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
{
+ unsigned long last_events = rdev->last_events;
+
+ if (!bdev_is_partition(rdev->bdev))
+ return true;
+
+ /*
+ * If rdev is partition, and user doesn't issue IO to the array, the
+ * array is still not idle if user issues IO to other partitions.
+ */
+ rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
+ sectors) -
+ part_stat_read_accum(rdev->bdev, sectors);
+
+ return init || rdev->last_events <= last_events;
+}
+
+/*
+ * mddev is idle if following conditions are matched since last check:
+ * 1) mddev doesn't have normal IO completed;
+ * 2) mddev doesn't have inflight normal IO;
+ * 3) if any member disk is partition, and other partitions don't have IO
+ * completed;
+ *
+ * Noted this checking rely on IO accounting is enabled.
+ */
+static bool is_mddev_idle(struct mddev *mddev, int init)
+{
+ unsigned long last_events = mddev->normal_io_events;
+ struct gendisk *disk;
struct md_rdev *rdev;
- int idle;
- int curr_events;
+ bool idle = true;
- idle = 1;
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev) {
- struct gendisk *disk = rdev->bdev->bd_disk;
+ disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
+ if (!disk)
+ return true;
- if (!init && !blk_queue_io_stat(disk->queue))
- continue;
+ mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors);
+ if (!init && (mddev->normal_io_events > last_events ||
+ bdev_count_inflight(disk->part0)))
+ idle = false;
- curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
- atomic_read(&disk->sync_io);
- /* sync IO will cause sync_io to increase before the disk_stats
- * as sync_io is counted when a request starts, and
- * disk_stats is counted when it completes.
- * So resync activity will cause curr_events to be smaller than
- * when there was no such activity.
- * non-sync IO will cause disk_stat to increase without
- * increasing sync_io so curr_events will (eventually)
- * be larger than it was before. Once it becomes
- * substantially larger, the test below will cause
- * the array to appear non-idle, and resync will slow
- * down.
- * If there is a lot of outstanding resync activity when
- * we set last_event to curr_events, then all that activity
- * completing might cause the array to appear non-idle
- * and resync will be slowed down even though there might
- * not have been non-resync activity. This will only
- * happen once though. 'last_events' will soon reflect
- * the state where there is little or no outstanding
- * resync requests, and further resync activity will
- * always make curr_events less than last_events.
- *
- */
- if (init || curr_events - rdev->last_events > 64) {
- rdev->last_events = curr_events;
- idle = 0;
- }
- }
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (!is_rdev_holder_idle(rdev, init))
+ idle = false;
rcu_read_unlock();
+
return idle;
}
@@ -8927,6 +8985,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
}
}
+static bool sync_io_within_limit(struct mddev *mddev)
+{
+ int io_sectors;
+
+ /*
+ * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
+ * RESYNC_PAGES(64k) per IO.
+ */
+ if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
+ io_sectors = 8;
+ else
+ io_sectors = 128;
+
+ return atomic_read(&mddev->recovery_active) <
+ io_sectors * sync_io_depth(mddev);
+}
+
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
@@ -9195,7 +9270,8 @@ void md_do_sync(struct md_thread *thread)
msleep(500);
goto repeat;
}
- if (!is_mddev_idle(mddev, 0)) {
+ if (!sync_io_within_limit(mddev) &&
+ !is_mddev_idle(mddev, 0)) {
/*
* Give other IO more of a chance.
* The faster the devices, the less we wait.
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1cf00a04bcdd..d45a9e6ead80 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -132,7 +132,7 @@ struct md_rdev {
sector_t sectors; /* Device size (in 512bytes sectors) */
struct mddev *mddev; /* RAID array if running */
- int last_events; /* IO event timestamp */
+ unsigned long last_events; /* IO event timestamp */
/*
* If meta_bdev is non-NULL, it means that a separate device is
@@ -404,7 +404,8 @@ struct mddev {
* are happening, so run/
* takeover/stop are not safe
*/
- struct gendisk *gendisk;
+ struct gendisk *gendisk; /* mdraid gendisk */
+ struct gendisk *dm_gendisk; /* dm-raid gendisk */
struct kobject kobj;
int hold_active;
@@ -483,6 +484,7 @@ struct mddev {
/* if zero, use the system-wide default */
int sync_speed_min;
int sync_speed_max;
+ int sync_io_depth;
/* resync even though the same disks are shared among md-devices */
int parallel_resync;
@@ -518,6 +520,7 @@ struct mddev {
* adding a spare
*/
+ unsigned long normal_io_events; /* IO event timestamp */
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
sector_t recovery_cp;
@@ -714,17 +717,6 @@ static inline int mddev_trylock(struct mddev *mddev)
}
extern void mddev_unlock(struct mddev *mddev);
-static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
-{
- if (blk_queue_io_stat(bdev->bd_disk->queue))
- atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
-}
-
-static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
-{
- md_sync_acct(bio->bi_bdev, nr_sectors);
-}
-
struct md_personality
{
struct md_submodule_head head;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index de9bccbe7337..657d481525be 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2382,7 +2382,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
wbio->bi_end_io = end_sync_write;
atomic_inc(&r1_bio->remaining);
- md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
submit_bio_noacct(wbio);
}
@@ -3055,7 +3054,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
read_targets--;
- md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
submit_bio_noacct(bio);
@@ -3064,7 +3062,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
} else {
atomic_set(&r1_bio->remaining, 1);
bio = r1_bio->bios[r1_bio->read_disk];
- md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
submit_bio_noacct(bio);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ba32bac975b8..dce06bf65016 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2426,7 +2426,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
atomic_inc(&r10_bio->remaining);
- md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
tbio->bi_opf |= MD_FAILFAST;
@@ -2448,8 +2447,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
bio_copy_data(tbio, fbio);
d = r10_bio->devs[i].devnum;
atomic_inc(&r10_bio->remaining);
- md_sync_acct(conf->mirrors[d].replacement->bdev,
- bio_sectors(tbio));
submit_bio_noacct(tbio);
}
@@ -2583,13 +2580,10 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
d = r10_bio->devs[1].devnum;
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
- md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
submit_bio_noacct(wbio);
}
if (wbio2) {
atomic_inc(&conf->mirrors[d].replacement->nr_pending);
- md_sync_acct(conf->mirrors[d].replacement->bdev,
- bio_sectors(wbio2));
submit_bio_noacct(wbio2);
}
}
@@ -3757,7 +3751,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->sectors = nr_sectors;
if (bio->bi_end_io == end_sync_read) {
- md_sync_acct_bio(bio, nr_sectors);
bio->bi_status = 0;
submit_bio_noacct(bio);
}
@@ -4880,7 +4873,6 @@ read_more:
r10_bio->sectors = nr_sectors;
/* Now submit the read */
- md_sync_acct_bio(read_bio, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
read_bio->bi_next = NULL;
submit_bio_noacct(read_bio);
@@ -4940,7 +4932,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
continue;
atomic_inc(&rdev->nr_pending);
- md_sync_acct_bio(b, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
b->bi_next = NULL;
submit_bio_noacct(b);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6389383166c0..ca5b0e8ba707 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1240,10 +1240,6 @@ again:
}
if (rdev) {
- if (s->syncing || s->expanding || s->expanded
- || s->replacing)
- md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
-
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags);
@@ -1300,10 +1296,6 @@ again:
submit_bio_noacct(bi);
}
if (rrdev) {
- if (s->syncing || s->expanding || s->expanded
- || s->replacing)
- md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
-
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags);
diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c
index 2c092ec8c0a9..3b6d759bcdf2 100644
--- a/drivers/nvme/common/auth.c
+++ b/drivers/nvme/common/auth.c
@@ -242,7 +242,7 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
{
const char *hmac_name;
struct crypto_shash *key_tfm;
- struct shash_desc *shash;
+ SHASH_DESC_ON_STACK(shash, key_tfm);
struct nvme_dhchap_key *transformed_key;
int ret, key_len;
@@ -267,19 +267,11 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
if (IS_ERR(key_tfm))
return ERR_CAST(key_tfm);
- shash = kmalloc(sizeof(struct shash_desc) +
- crypto_shash_descsize(key_tfm),
- GFP_KERNEL);
- if (!shash) {
- ret = -ENOMEM;
- goto out_free_key;
- }
-
key_len = crypto_shash_digestsize(key_tfm);
transformed_key = nvme_auth_alloc_key(key_len, key->hash);
if (!transformed_key) {
ret = -ENOMEM;
- goto out_free_shash;
+ goto out_free_key;
}
shash->tfm = key_tfm;
@@ -299,15 +291,12 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
if (ret < 0)
goto out_free_transformed_key;
- kfree(shash);
crypto_free_shash(key_tfm);
return transformed_key;
out_free_transformed_key:
nvme_auth_free_key(transformed_key);
-out_free_shash:
- kfree(shash);
out_free_key:
crypto_free_shash(key_tfm);
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index 6115fef74c1e..f6ddbe553289 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -31,6 +31,7 @@ struct nvme_dhchap_queue_context {
u32 s1;
u32 s2;
bool bi_directional;
+ bool authenticated;
u16 transaction;
u8 status;
u8 dhgroup_id;
@@ -682,6 +683,7 @@ static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap)
static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap)
{
nvme_auth_reset_dhchap(chap);
+ chap->authenticated = false;
if (chap->shash_tfm)
crypto_free_shash(chap->shash_tfm);
if (chap->dh_tfm)
@@ -930,12 +932,14 @@ static void nvme_queue_auth_work(struct work_struct *work)
}
if (!ret) {
chap->error = 0;
+ chap->authenticated = true;
if (ctrl->opts->concat &&
(ret = nvme_auth_secure_concat(ctrl, chap))) {
dev_warn(ctrl->device,
"%s: qid %d failed to enable secure concatenation\n",
__func__, chap->qid);
chap->error = ret;
+ chap->authenticated = false;
}
return;
}
@@ -1023,13 +1027,16 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
return;
for (q = 1; q < ctrl->queue_count; q++) {
- ret = nvme_auth_negotiate(ctrl, q);
- if (ret) {
- dev_warn(ctrl->device,
- "qid %d: error %d setting up authentication\n",
- q, ret);
- break;
- }
+ struct nvme_dhchap_queue_context *chap =
+ &ctrl->dhchap_ctxs[q];
+ /*
+ * Skip re-authentication if the queue had
+ * not been authenticated initially.
+ */
+ if (!chap->authenticated)
+ continue;
+ cancel_work_sync(&chap->auth_work);
+ queue_work(nvme_auth_wq, &chap->auth_work);
}
/*
@@ -1037,7 +1044,13 @@ static void nvme_ctrl_auth_work(struct work_struct *work)
* the controller terminates the connection.
*/
for (q = 1; q < ctrl->queue_count; q++) {
- ret = nvme_auth_wait(ctrl, q);
+ struct nvme_dhchap_queue_context *chap =
+ &ctrl->dhchap_ctxs[q];
+ if (!chap->authenticated)
+ continue;
+ flush_work(&chap->auth_work);
+ ret = chap->error;
+ nvme_auth_reset_dhchap(chap);
if (ret)
dev_warn(ctrl->device,
"qid %d: authentication failed\n", q);
@@ -1076,6 +1089,7 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
chap = &ctrl->dhchap_ctxs[i];
chap->qid = i;
chap->ctrl = ctrl;
+ chap->authenticated = false;
INIT_WORK(&chap->auth_work, nvme_queue_auth_work);
}
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6b04473c0ab7..f69a232a000a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -38,6 +38,8 @@ struct nvme_ns_info {
u32 nsid;
__le32 anagrpid;
u8 pi_offset;
+ u16 endgid;
+ u64 runs;
bool is_shared;
bool is_readonly;
bool is_ready;
@@ -150,6 +152,8 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid);
static void nvme_update_keep_alive(struct nvme_ctrl *ctrl,
struct nvme_command *cmd);
+static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
+ u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi);
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
@@ -664,10 +668,11 @@ static void nvme_free_ns_head(struct kref *ref)
struct nvme_ns_head *head =
container_of(ref, struct nvme_ns_head, ref);
- nvme_mpath_remove_disk(head);
+ nvme_mpath_put_disk(head);
ida_free(&head->subsys->ns_ida, head->instance);
cleanup_srcu_struct(&head->srcu);
nvme_put_subsystem(head->subsys);
+ kfree(head->plids);
kfree(head);
}
@@ -991,6 +996,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+ if (op == nvme_cmd_write && ns->head->nr_plids) {
+ u16 write_stream = req->bio->bi_write_stream;
+
+ if (WARN_ON_ONCE(write_stream > ns->head->nr_plids))
+ return BLK_STS_INVAL;
+
+ if (write_stream) {
+ dsmgmt |= ns->head->plids[write_stream - 1] << 16;
+ control |= NVME_RW_DTYPE_DPLCMT;
+ }
+ }
+
if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
return BLK_STS_INVAL;
@@ -1157,7 +1174,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
req->cmd_flags &= ~REQ_FAILFAST_DRIVER;
if (buffer && bufflen) {
- ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
+ ret = blk_rq_map_kern(req, buffer, bufflen, GFP_KERNEL);
if (ret)
goto out;
}
@@ -1609,6 +1626,7 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
info->is_ready = true;
+ info->endgid = le16_to_cpu(id->endgid);
if (ctrl->quirks & NVME_QUIRK_BOGUS_NID) {
dev_info(ctrl->device,
"Ignoring bogus Namespace Identifiers\n");
@@ -1649,6 +1667,7 @@ static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,
info->is_ready = id->nstat & NVME_NSTAT_NRDY;
info->is_rotational = id->nsfeat & NVME_NS_ROTATIONAL;
info->no_vwc = id->nsfeat & NVME_NS_VWC_NOT_PRESENT;
+ info->endgid = le16_to_cpu(id->endgid);
}
kfree(id);
return ret;
@@ -1674,7 +1693,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result)
+ void *result)
{
return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
buflen, result);
@@ -1683,7 +1702,7 @@ EXPORT_SYMBOL_GPL(nvme_set_features);
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result)
+ void *result)
{
return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
buflen, result);
@@ -2167,6 +2186,148 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns,
return ret;
}
+static int nvme_query_fdp_granularity(struct nvme_ctrl *ctrl,
+ struct nvme_ns_info *info, u8 fdp_idx)
+{
+ struct nvme_fdp_config_log hdr, *h;
+ struct nvme_fdp_config_desc *desc;
+ size_t size = sizeof(hdr);
+ void *log, *end;
+ int i, n, ret;
+
+ ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
+ NVME_CSI_NVM, &hdr, size, 0, info->endgid);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "FDP configs log header status:0x%x endgid:%d\n", ret,
+ info->endgid);
+ return ret;
+ }
+
+ size = le32_to_cpu(hdr.sze);
+ if (size > PAGE_SIZE * MAX_ORDER_NR_PAGES) {
+ dev_warn(ctrl->device, "FDP config size too large:%zu\n",
+ size);
+ return 0;
+ }
+
+ h = kvmalloc(size, GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ ret = nvme_get_log_lsi(ctrl, 0, NVME_LOG_FDP_CONFIGS, 0,
+ NVME_CSI_NVM, h, size, 0, info->endgid);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "FDP configs log status:0x%x endgid:%d\n", ret,
+ info->endgid);
+ goto out;
+ }
+
+ n = le16_to_cpu(h->numfdpc) + 1;
+ if (fdp_idx > n) {
+ dev_warn(ctrl->device, "FDP index:%d out of range:%d\n",
+ fdp_idx, n);
+ /* Proceed without registering FDP streams */
+ ret = 0;
+ goto out;
+ }
+
+ log = h + 1;
+ desc = log;
+ end = log + size - sizeof(*h);
+ for (i = 0; i < fdp_idx; i++) {
+ log += le16_to_cpu(desc->dsze);
+ desc = log;
+ if (log >= end) {
+ dev_warn(ctrl->device,
+ "FDP invalid config descriptor list\n");
+ ret = 0;
+ goto out;
+ }
+ }
+
+ if (le32_to_cpu(desc->nrg) > 1) {
+ dev_warn(ctrl->device, "FDP NRG > 1 not supported\n");
+ ret = 0;
+ goto out;
+ }
+
+ info->runs = le64_to_cpu(desc->runs);
+out:
+ kvfree(h);
+ return ret;
+}
+
+static int nvme_query_fdp_info(struct nvme_ns *ns, struct nvme_ns_info *info)
+{
+ struct nvme_ns_head *head = ns->head;
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ struct nvme_fdp_ruh_status *ruhs;
+ struct nvme_fdp_config fdp;
+ struct nvme_command c = {};
+ size_t size;
+ int i, ret;
+
+ /*
+ * The FDP configuration is static for the lifetime of the namespace,
+ * so return immediately if we've already registered this namespace's
+ * streams.
+ */
+ if (head->nr_plids)
+ return 0;
+
+ ret = nvme_get_features(ctrl, NVME_FEAT_FDP, info->endgid, NULL, 0,
+ &fdp);
+ if (ret) {
+ dev_warn(ctrl->device, "FDP get feature status:0x%x\n", ret);
+ return ret;
+ }
+
+ if (!(fdp.flags & FDPCFG_FDPE))
+ return 0;
+
+ ret = nvme_query_fdp_granularity(ctrl, info, fdp.fdpcidx);
+ if (!info->runs)
+ return ret;
+
+ size = struct_size(ruhs, ruhsd, S8_MAX - 1);
+ ruhs = kzalloc(size, GFP_KERNEL);
+ if (!ruhs)
+ return -ENOMEM;
+
+ c.imr.opcode = nvme_cmd_io_mgmt_recv;
+ c.imr.nsid = cpu_to_le32(head->ns_id);
+ c.imr.mo = NVME_IO_MGMT_RECV_MO_RUHS;
+ c.imr.numd = cpu_to_le32(nvme_bytes_to_numd(size));
+ ret = nvme_submit_sync_cmd(ns->queue, &c, ruhs, size);
+ if (ret) {
+ dev_warn(ctrl->device, "FDP io-mgmt status:0x%x\n", ret);
+ goto free;
+ }
+
+ head->nr_plids = le16_to_cpu(ruhs->nruhsd);
+ if (!head->nr_plids)
+ goto free;
+
+ head->plids = kcalloc(head->nr_plids, sizeof(*head->plids),
+ GFP_KERNEL);
+ if (!head->plids) {
+ dev_warn(ctrl->device,
+ "failed to allocate %u FDP placement IDs\n",
+ head->nr_plids);
+ head->nr_plids = 0;
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ for (i = 0; i < head->nr_plids; i++)
+ head->plids[i] = le16_to_cpu(ruhs->ruhsd[i].pid);
+free:
+ kfree(ruhs);
+ return ret;
+}
+
static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
@@ -2204,6 +2365,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
goto out;
}
+ if (ns->ctrl->ctratt & NVME_CTRL_ATTR_FDPS) {
+ ret = nvme_query_fdp_info(ns, info);
+ if (ret < 0)
+ goto out;
+ }
+
lim = queue_limits_start_update(ns->disk->queue);
memflags = blk_mq_freeze_queue(ns->disk->queue);
@@ -2248,6 +2415,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if (!nvme_init_integrity(ns->head, &lim, info))
capacity = 0;
+ lim.max_write_streams = ns->head->nr_plids;
+ if (lim.max_write_streams)
+ lim.write_stream_granularity = min(info->runs, U32_MAX);
+ else
+ lim.write_stream_granularity = 0;
+
ret = queue_limits_commit_update(ns->disk->queue, &lim);
if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue, memflags);
@@ -2351,6 +2524,8 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
ns->head->disk->flags |= GENHD_FL_HIDDEN;
else
nvme_init_integrity(ns->head, &lim, info);
+ lim.max_write_streams = ns_lim->max_write_streams;
+ lim.write_stream_granularity = ns_lim->write_stream_granularity;
ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
@@ -3108,8 +3283,8 @@ out_unlock:
return ret;
}
-int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
- void *log, size_t size, u64 offset)
+static int nvme_get_log_lsi(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page,
+ u8 lsp, u8 csi, void *log, size_t size, u64 offset, u16 lsi)
{
struct nvme_command c = { };
u32 dwlen = nvme_bytes_to_numd(size);
@@ -3123,10 +3298,18 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
c.get_log_page.csi = csi;
+ c.get_log_page.lsi = cpu_to_le16(lsi);
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
+ void *log, size_t size, u64 offset)
+{
+ return nvme_get_log_lsi(ctrl, nsid, log_page, lsp, csi, log, size,
+ offset, 0);
+}
+
static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
struct nvme_effects_log **log)
{
@@ -3584,7 +3767,7 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
*/
if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
continue;
- if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
+ if (nvme_tryget_ns_head(h))
return h;
}
@@ -3828,7 +4011,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
}
} else {
ret = -EINVAL;
- if (!info->is_shared || !head->shared) {
+ if ((!info->is_shared || !head->shared) &&
+ !list_empty(&head->list)) {
dev_err(ctrl->device,
"Duplicate unshared namespace %d\n",
info->nsid);
@@ -4032,7 +4216,8 @@ static void nvme_ns_remove(struct nvme_ns *ns)
mutex_lock(&ns->ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
if (list_empty(&ns->head->list)) {
- list_del_init(&ns->head->entry);
+ if (!nvme_mpath_queue_if_no_path(ns->head))
+ list_del_init(&ns->head->entry);
last_path = true;
}
mutex_unlock(&ns->ctrl->subsys->lock);
@@ -4053,7 +4238,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
synchronize_srcu(&ns->ctrl->srcu);
if (last_path)
- nvme_mpath_shutdown_disk(ns->head);
+ nvme_mpath_remove_disk(ns->head);
nvme_put_ns(ns);
}
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 2257c3c96dd2..fdafa3e9e66f 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1410,9 +1410,8 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
}
static void
-nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
+nvme_fc_xmt_ls_rsp_free(struct nvmefc_ls_rcv_op *lsop)
{
- struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private;
struct nvme_fc_rport *rport = lsop->rport;
struct nvme_fc_lport *lport = rport->lport;
unsigned long flags;
@@ -1434,6 +1433,14 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
}
static void
+nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
+{
+ struct nvmefc_ls_rcv_op *lsop = lsrsp->nvme_fc_private;
+
+ nvme_fc_xmt_ls_rsp_free(lsop);
+}
+
+static void
nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop)
{
struct nvme_fc_rport *rport = lsop->rport;
@@ -1450,7 +1457,7 @@ nvme_fc_xmt_ls_rsp(struct nvmefc_ls_rcv_op *lsop)
dev_warn(lport->dev,
"LLDD rejected LS RSP xmt: LS %d status %d\n",
w0->ls_cmd, ret);
- nvme_fc_xmt_ls_rsp_done(lsop->lsrsp);
+ nvme_fc_xmt_ls_rsp_free(lsop);
return;
}
}
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index cf0ef4745564..878ea8b1a0ac 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -10,10 +10,61 @@
#include "nvme.h"
bool multipath = true;
-module_param(multipath, bool, 0444);
+static bool multipath_always_on;
+
+static int multipath_param_set(const char *val, const struct kernel_param *kp)
+{
+ int ret;
+ bool *arg = kp->arg;
+
+ ret = param_set_bool(val, kp);
+ if (ret)
+ return ret;
+
+ if (multipath_always_on && !*arg) {
+ pr_err("Can't disable multipath when multipath_always_on is configured.\n");
+ *arg = true;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct kernel_param_ops multipath_param_ops = {
+ .set = multipath_param_set,
+ .get = param_get_bool,
+};
+
+module_param_cb(multipath, &multipath_param_ops, &multipath, 0444);
MODULE_PARM_DESC(multipath,
"turn on native support for multiple controllers per subsystem");
+static int multipath_always_on_set(const char *val,
+ const struct kernel_param *kp)
+{
+ int ret;
+ bool *arg = kp->arg;
+
+ ret = param_set_bool(val, kp);
+ if (ret < 0)
+ return ret;
+
+ if (*arg)
+ multipath = true;
+
+ return 0;
+}
+
+static const struct kernel_param_ops multipath_always_on_ops = {
+ .set = multipath_always_on_set,
+ .get = param_get_bool,
+};
+
+module_param_cb(multipath_always_on, &multipath_always_on_ops,
+ &multipath_always_on, 0444);
+MODULE_PARM_DESC(multipath_always_on,
+ "create multipath node always except for private namespace with non-unique nsid; note that this also implicitly enables native multipath support");
+
static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
@@ -442,7 +493,17 @@ static bool nvme_available_path(struct nvme_ns_head *head)
break;
}
}
- return false;
+
+ /*
+ * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
+ * not immediately fail I/O. Instead, requeue the I/O for the configured
+ * duration, anticipating that if there's a transient link failure then
+ * it may recover within this time window. This parameter is exported to
+ * userspace via sysfs, and its default value is zero. It is internally
+ * mapped to NVME_NSHEAD_QUEUE_IF_NO_PATH. When delayed_removal_secs is
+ * non-zero, this flag is set to true. When zero, the flag is cleared.
+ */
+ return nvme_mpath_queue_if_no_path(head);
}
static void nvme_ns_head_submit_bio(struct bio *bio)
@@ -617,6 +678,40 @@ static void nvme_requeue_work(struct work_struct *work)
}
}
+static void nvme_remove_head(struct nvme_ns_head *head)
+{
+ if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+ /*
+ * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
+ * to allow multipath to fail all I/O.
+ */
+ kblockd_schedule_work(&head->requeue_work);
+
+ nvme_cdev_del(&head->cdev, &head->cdev_device);
+ synchronize_srcu(&head->srcu);
+ del_gendisk(head->disk);
+ nvme_put_ns_head(head);
+ }
+}
+
+static void nvme_remove_head_work(struct work_struct *work)
+{
+ struct nvme_ns_head *head = container_of(to_delayed_work(work),
+ struct nvme_ns_head, remove_work);
+ bool remove = false;
+
+ mutex_lock(&head->subsys->lock);
+ if (list_empty(&head->list)) {
+ list_del_init(&head->entry);
+ remove = true;
+ }
+ mutex_unlock(&head->subsys->lock);
+ if (remove)
+ nvme_remove_head(head);
+
+ module_put(THIS_MODULE);
+}
+
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
struct queue_limits lim;
@@ -626,14 +721,25 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
spin_lock_init(&head->requeue_lock);
INIT_WORK(&head->requeue_work, nvme_requeue_work);
INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
+ INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
+ head->delayed_removal_secs = 0;
/*
- * Add a multipath node if the subsystems supports multiple controllers.
- * We also do this for private namespaces as the namespace sharing flag
- * could change after a rescan.
+ * If "multipath_always_on" is enabled, a multipath node is added
+ * regardless of whether the disk is single/multi ported, and whether
+ * the namespace is shared or private. If "multipath_always_on" is not
+ * enabled, a multipath node is added only if the subsystem supports
+ * multiple controllers and the "multipath" option is configured. In
+ * either case, for private namespaces, we ensure that the NSID is
+ * unique.
*/
- if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
- !nvme_is_unique_nsid(ctrl, head) || !multipath)
+ if (!multipath_always_on) {
+ if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
+ !multipath)
+ return 0;
+ }
+
+ if (!nvme_is_unique_nsid(ctrl, head))
return 0;
blk_set_stacking_limits(&lim);
@@ -660,6 +766,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
sprintf(head->disk->disk_name, "nvme%dn%d",
ctrl->subsys->instance, head->instance);
+ nvme_tryget_ns_head(head);
return 0;
}
@@ -1016,6 +1123,49 @@ static ssize_t numa_nodes_show(struct device *dev, struct device_attribute *attr
}
DEVICE_ATTR_RO(numa_nodes);
+static ssize_t delayed_removal_secs_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nvme_ns_head *head = disk->private_data;
+ int ret;
+
+ mutex_lock(&head->subsys->lock);
+ ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
+ mutex_unlock(&head->subsys->lock);
+ return ret;
+}
+
+static ssize_t delayed_removal_secs_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nvme_ns_head *head = disk->private_data;
+ unsigned int sec;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &sec);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&head->subsys->lock);
+ head->delayed_removal_secs = sec;
+ if (sec)
+ set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
+ else
+ clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
+ mutex_unlock(&head->subsys->lock);
+ /*
+ * Ensure that update to NVME_NSHEAD_QUEUE_IF_NO_PATH is seen
+ * by its reader.
+ */
+ synchronize_srcu(&head->srcu);
+
+ return count;
+}
+
+DEVICE_ATTR_RW(delayed_removal_secs);
+
static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
struct nvme_ana_group_desc *desc, void *data)
{
@@ -1137,23 +1287,43 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
#endif
}
-void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
+void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
- if (!head->disk)
- return;
- if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
- nvme_cdev_del(&head->cdev, &head->cdev_device);
+ bool remove = false;
+
+ mutex_lock(&head->subsys->lock);
+ /*
+ * We are called when all paths have been removed, and at that point
+ * head->list is expected to be empty. However, nvme_remove_ns() and
+ * nvme_init_ns_head() can run concurrently and so if head->delayed_
+ * removal_secs is configured, it is possible that by the time we reach
+ * this point, head->list may no longer be empty. Therefore, we recheck
+ * head->list here. If it is no longer empty then we skip enqueuing the
+ * delayed head removal work.
+ */
+ if (!list_empty(&head->list))
+ goto out;
+
+ if (head->delayed_removal_secs) {
/*
- * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
- * to allow multipath to fail all I/O.
+ * Ensure that no one could remove this module while the head
+ * remove work is pending.
*/
- synchronize_srcu(&head->srcu);
- kblockd_schedule_work(&head->requeue_work);
- del_gendisk(head->disk);
+ if (!try_module_get(THIS_MODULE))
+ goto out;
+ queue_delayed_work(nvme_wq, &head->remove_work,
+ head->delayed_removal_secs * HZ);
+ } else {
+ list_del_init(&head->entry);
+ remove = true;
}
+out:
+ mutex_unlock(&head->subsys->lock);
+ if (remove)
+ nvme_remove_head(head);
}
-void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
if (!head->disk)
return;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 8fc4683418a3..ad0c1f834f09 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -497,6 +497,9 @@ struct nvme_ns_head {
struct device cdev_device;
struct gendisk *disk;
+
+ u16 nr_plids;
+ u16 *plids;
#ifdef CONFIG_NVME_MULTIPATH
struct bio_list requeue_list;
spinlock_t requeue_lock;
@@ -504,7 +507,10 @@ struct nvme_ns_head {
struct work_struct partition_scan_work;
struct mutex lock;
unsigned long flags;
-#define NVME_NSHEAD_DISK_LIVE 0
+ struct delayed_work remove_work;
+ unsigned int delayed_removal_secs;
+#define NVME_NSHEAD_DISK_LIVE 0
+#define NVME_NSHEAD_QUEUE_IF_NO_PATH 1
struct nvme_ns __rcu *current_path[];
#endif
};
@@ -897,10 +903,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
int qid, nvme_submit_flags_t flags);
int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result);
+ void *result);
int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen,
- u32 *result);
+ void *result);
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
@@ -961,7 +967,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns);
void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns);
void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
-void nvme_mpath_remove_disk(struct nvme_ns_head *head);
+void nvme_mpath_put_disk(struct nvme_ns_head *head);
int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
void nvme_mpath_update(struct nvme_ctrl *ctrl);
@@ -970,7 +976,7 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl);
bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
void nvme_mpath_revalidate_paths(struct nvme_ns *ns);
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
-void nvme_mpath_shutdown_disk(struct nvme_ns_head *head);
+void nvme_mpath_remove_disk(struct nvme_ns_head *head);
void nvme_mpath_start_request(struct request *rq);
void nvme_mpath_end_request(struct request *rq);
@@ -987,12 +993,19 @@ extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute dev_attr_queue_depth;
extern struct device_attribute dev_attr_numa_nodes;
+extern struct device_attribute dev_attr_delayed_removal_secs;
extern struct device_attribute subsys_attr_iopolicy;
static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
{
return disk->fops == &nvme_ns_head_ops;
}
+static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
+{
+ if (test_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags))
+ return true;
+ return false;
+}
#else
#define multipath false
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
@@ -1013,7 +1026,7 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
{
}
-static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
+static inline void nvme_mpath_put_disk(struct nvme_ns_head *head)
{
}
static inline void nvme_mpath_add_sysfs_link(struct nvme_ns *ns)
@@ -1032,7 +1045,7 @@ static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
}
-static inline void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
+static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
}
static inline void nvme_trace_bio_complete(struct request *req)
@@ -1080,6 +1093,10 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
{
return false;
}
+static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
+{
+ return false;
+}
#endif /* CONFIG_NVME_MULTIPATH */
int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f1dd804151b1..e0bfe04a2bc2 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/nodemask.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/suspend.h>
@@ -34,16 +35,31 @@
#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
-#define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc))
+/* Optimisation for I/Os between 4k and 128k */
+#define NVME_SMALL_POOL_SIZE 256
/*
* These can be higher, but we need to ensure that any command doesn't
* require an sg allocation that needs more than a page of data.
*/
#define NVME_MAX_KB_SZ 8192
-#define NVME_MAX_SEGS 128
-#define NVME_MAX_META_SEGS 15
-#define NVME_MAX_NR_ALLOCATIONS 5
+#define NVME_MAX_NR_DESCRIPTORS 5
+
+/*
+ * For data SGLs we support a single descriptors worth of SGL entries, but for
+ * now we also limit it to avoid an allocation larger than PAGE_SIZE for the
+ * scatterlist.
+ */
+#define NVME_MAX_SEGS \
+ min(NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc), \
+ (PAGE_SIZE / sizeof(struct scatterlist)))
+
+/*
+ * For metadata SGLs, only the small descriptor is supported, and the first
+ * entry is the segment descriptor, which for the data pointer sits in the SQE.
+ */
+#define NVME_MAX_META_SEGS \
+ ((NVME_SMALL_POOL_SIZE / sizeof(struct nvme_sgl_desc)) - 1)
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0444);
@@ -112,6 +128,11 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static void nvme_delete_io_queues(struct nvme_dev *dev);
static void nvme_update_attrs(struct nvme_dev *dev);
+struct nvme_descriptor_pools {
+ struct dma_pool *large;
+ struct dma_pool *small;
+};
+
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
@@ -121,8 +142,6 @@ struct nvme_dev {
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
struct device *dev;
- struct dma_pool *prp_page_pool;
- struct dma_pool *prp_small_pool;
unsigned online_queues;
unsigned max_qid;
unsigned io_queues[HCTX_MAX_TYPES];
@@ -162,6 +181,7 @@ struct nvme_dev {
unsigned int nr_allocated_queues;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
+ struct nvme_descriptor_pools descriptor_pools[];
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -191,6 +211,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
*/
struct nvme_queue {
struct nvme_dev *dev;
+ struct nvme_descriptor_pools descriptor_pools;
spinlock_t sq_lock;
void *sq_cmds;
/* only used for poll queues: */
@@ -219,30 +240,30 @@ struct nvme_queue {
struct completion delete_done;
};
-union nvme_descriptor {
- struct nvme_sgl_desc *sg_list;
- __le64 *prp_list;
+/* bits for iod->flags */
+enum nvme_iod_flags {
+ /* this command has been aborted by the timeout handler */
+ IOD_ABORTED = 1U << 0,
+
+ /* uses the small descriptor pool */
+ IOD_SMALL_DESCRIPTOR = 1U << 1,
};
/*
* The nvme_iod describes the data in an I/O.
- *
- * The sg pointer contains the list of PRP/SGL chunk allocations in addition
- * to the actual struct scatterlist.
*/
struct nvme_iod {
struct nvme_request req;
struct nvme_command cmd;
- bool aborted;
- s8 nr_allocations; /* PRP list pool allocations. 0 means small
- pool in use */
+ u8 flags;
+ u8 nr_descriptors;
unsigned int dma_len; /* length of single DMA segment mapping */
dma_addr_t first_dma;
dma_addr_t meta_dma;
struct sg_table sgt;
struct sg_table meta_sgt;
- union nvme_descriptor meta_list;
- union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
+ struct nvme_sgl_desc *meta_descriptor;
+ void *descriptors[NVME_MAX_NR_DESCRIPTORS];
};
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
@@ -397,30 +418,78 @@ static __always_inline int nvme_pci_npages_prp(void)
return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
}
-static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
- unsigned int hctx_idx)
+static struct nvme_descriptor_pools *
+nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node)
{
- struct nvme_dev *dev = to_nvme_dev(data);
- struct nvme_queue *nvmeq = &dev->queues[0];
+ struct nvme_descriptor_pools *pools = &dev->descriptor_pools[numa_node];
+ size_t small_align = NVME_SMALL_POOL_SIZE;
- WARN_ON(hctx_idx != 0);
- WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
+ if (pools->small)
+ return pools; /* already initialized */
- hctx->driver_data = nvmeq;
- return 0;
+ pools->large = dma_pool_create_node("nvme descriptor page", dev->dev,
+ NVME_CTRL_PAGE_SIZE, NVME_CTRL_PAGE_SIZE, 0, numa_node);
+ if (!pools->large)
+ return ERR_PTR(-ENOMEM);
+
+ if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
+ small_align = 512;
+
+ pools->small = dma_pool_create_node("nvme descriptor small", dev->dev,
+ NVME_SMALL_POOL_SIZE, small_align, 0, numa_node);
+ if (!pools->small) {
+ dma_pool_destroy(pools->large);
+ pools->large = NULL;
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return pools;
}
-static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
- unsigned int hctx_idx)
+static void nvme_release_descriptor_pools(struct nvme_dev *dev)
+{
+ unsigned i;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ struct nvme_descriptor_pools *pools = &dev->descriptor_pools[i];
+
+ dma_pool_destroy(pools->large);
+ dma_pool_destroy(pools->small);
+ }
+}
+
+static int nvme_init_hctx_common(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned qid)
{
struct nvme_dev *dev = to_nvme_dev(data);
- struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
+ struct nvme_queue *nvmeq = &dev->queues[qid];
+ struct nvme_descriptor_pools *pools;
+ struct blk_mq_tags *tags;
+
+ tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0];
+ WARN_ON(tags != hctx->tags);
+ pools = nvme_setup_descriptor_pools(dev, hctx->numa_node);
+ if (IS_ERR(pools))
+ return PTR_ERR(pools);
- WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
+ nvmeq->descriptor_pools = *pools;
hctx->driver_data = nvmeq;
return 0;
}
+static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
+{
+ WARN_ON(hctx_idx != 0);
+ return nvme_init_hctx_common(hctx, data, 0);
+}
+
+static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
+{
+ return nvme_init_hctx_common(hctx, data, hctx_idx + 1);
+}
+
static int nvme_pci_init_request(struct blk_mq_tag_set *set,
struct request *req, unsigned int hctx_idx,
unsigned int numa_node)
@@ -537,23 +606,39 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
return true;
}
-static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
+static inline struct dma_pool *nvme_dma_pool(struct nvme_queue *nvmeq,
+ struct nvme_iod *iod)
+{
+ if (iod->flags & IOD_SMALL_DESCRIPTOR)
+ return nvmeq->descriptor_pools.small;
+ return nvmeq->descriptor_pools.large;
+}
+
+static void nvme_free_descriptors(struct nvme_queue *nvmeq, struct request *req)
{
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
dma_addr_t dma_addr = iod->first_dma;
int i;
- for (i = 0; i < iod->nr_allocations; i++) {
- __le64 *prp_list = iod->list[i].prp_list;
+ if (iod->nr_descriptors == 1) {
+ dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0],
+ dma_addr);
+ return;
+ }
+
+ for (i = 0; i < iod->nr_descriptors; i++) {
+ __le64 *prp_list = iod->descriptors[i];
dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
- dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
+ dma_pool_free(nvmeq->descriptor_pools.large, prp_list,
+ dma_addr);
dma_addr = next_dma_addr;
}
}
-static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
+static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq,
+ struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -566,15 +651,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
WARN_ON_ONCE(!iod->sgt.nents);
dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
-
- if (iod->nr_allocations == 0)
- dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
- iod->first_dma);
- else if (iod->nr_allocations == 1)
- dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
- iod->first_dma);
- else
- nvme_free_prps(dev, req);
+ nvme_free_descriptors(nvmeq, req);
mempool_free(iod->sgt.sgl, dev->iod_mempool);
}
@@ -592,11 +669,10 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents)
}
}
-static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
+static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq,
struct request *req, struct nvme_rw_command *cmnd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct dma_pool *pool;
int length = blk_rq_payload_bytes(req);
struct scatterlist *sg = iod->sgt.sgl;
int dma_len = sg_dma_len(sg);
@@ -604,7 +680,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
__le64 *prp_list;
dma_addr_t prp_dma;
- int nprps, i;
+ int i;
length -= (NVME_CTRL_PAGE_SIZE - offset);
if (length <= 0) {
@@ -626,30 +702,26 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
goto done;
}
- nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
- if (nprps <= (256 / 8)) {
- pool = dev->prp_small_pool;
- iod->nr_allocations = 0;
- } else {
- pool = dev->prp_page_pool;
- iod->nr_allocations = 1;
- }
+ if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
+ NVME_SMALL_POOL_SIZE / sizeof(__le64))
+ iod->flags |= IOD_SMALL_DESCRIPTOR;
- prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
- if (!prp_list) {
- iod->nr_allocations = -1;
+ prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
+ &prp_dma);
+ if (!prp_list)
return BLK_STS_RESOURCE;
- }
- iod->list[0].prp_list = prp_list;
+ iod->descriptors[iod->nr_descriptors++] = prp_list;
iod->first_dma = prp_dma;
i = 0;
for (;;) {
if (i == NVME_CTRL_PAGE_SIZE >> 3) {
__le64 *old_prp_list = prp_list;
- prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+
+ prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
+ GFP_ATOMIC, &prp_dma);
if (!prp_list)
goto free_prps;
- iod->list[iod->nr_allocations++].prp_list = prp_list;
+ iod->descriptors[iod->nr_descriptors++] = prp_list;
prp_list[0] = old_prp_list[i - 1];
old_prp_list[i - 1] = cpu_to_le64(prp_dma);
i = 1;
@@ -673,7 +745,7 @@ done:
cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
return BLK_STS_OK;
free_prps:
- nvme_free_prps(dev, req);
+ nvme_free_descriptors(nvmeq, req);
return BLK_STS_RESOURCE;
bad_sgl:
WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
@@ -698,11 +770,10 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}
-static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
+static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq,
struct request *req, struct nvme_rw_command *cmd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct dma_pool *pool;
struct nvme_sgl_desc *sg_list;
struct scatterlist *sg = iod->sgt.sgl;
unsigned int entries = iod->sgt.nents;
@@ -717,21 +788,14 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
return BLK_STS_OK;
}
- if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
- pool = dev->prp_small_pool;
- iod->nr_allocations = 0;
- } else {
- pool = dev->prp_page_pool;
- iod->nr_allocations = 1;
- }
+ if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list))
+ iod->flags |= IOD_SMALL_DESCRIPTOR;
- sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
- if (!sg_list) {
- iod->nr_allocations = -1;
+ sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
+ &sgl_dma);
+ if (!sg_list)
return BLK_STS_RESOURCE;
- }
-
- iod->list[0].sg_list = sg_list;
+ iod->descriptors[iod->nr_descriptors++] = sg_list;
iod->first_dma = sgl_dma;
nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
@@ -785,12 +849,12 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret = BLK_STS_RESOURCE;
int rc;
if (blk_rq_nr_phys_segments(req) == 1) {
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
@@ -825,9 +889,9 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
}
if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
- ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
+ ret = nvme_pci_setup_sgls(nvmeq, req, &cmnd->rw);
else
- ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
+ ret = nvme_pci_setup_prps(nvmeq, req, &cmnd->rw);
if (ret != BLK_STS_OK)
goto out_unmap_sg;
return BLK_STS_OK;
@@ -842,6 +906,7 @@ out_free_sg:
static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
struct request *req)
{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_rw_command *cmnd = &iod->cmd.rw;
struct nvme_sgl_desc *sg_list;
@@ -865,12 +930,13 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
if (rc)
goto out_free_sg;
- sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
+ sg_list = dma_pool_alloc(nvmeq->descriptor_pools.small, GFP_ATOMIC,
+ &sgl_dma);
if (!sg_list)
goto out_unmap_sg;
entries = iod->meta_sgt.nents;
- iod->meta_list.sg_list = sg_list;
+ iod->meta_descriptor = sg_list;
iod->meta_dma = sgl_dma;
cmnd->flags = NVME_CMD_SGL_METASEG;
@@ -912,7 +978,10 @@ static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
{
- if (nvme_pci_metadata_use_sgls(dev, req))
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+ if ((iod->cmd.common.flags & NVME_CMD_SGL_METABUF) &&
+ nvme_pci_metadata_use_sgls(dev, req))
return nvme_pci_setup_meta_sgls(dev, req);
return nvme_pci_setup_meta_mptr(dev, req);
}
@@ -922,8 +991,8 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret;
- iod->aborted = false;
- iod->nr_allocations = -1;
+ iod->flags = 0;
+ iod->nr_descriptors = 0;
iod->sgt.nents = 0;
iod->meta_sgt.nents = 0;
@@ -947,7 +1016,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
return BLK_STS_OK;
out_unmap_data:
if (blk_rq_nr_phys_segments(req))
- nvme_unmap_data(dev, req);
+ nvme_unmap_data(dev, req->mq_hctx->driver_data, req);
out_free_cmd:
nvme_cleanup_cmd(req);
return ret;
@@ -1037,6 +1106,7 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
}
static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
+ struct nvme_queue *nvmeq,
struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1048,8 +1118,8 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
return;
}
- dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list,
- iod->meta_dma);
+ dma_pool_free(nvmeq->descriptor_pools.small, iod->meta_descriptor,
+ iod->meta_dma);
dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
}
@@ -1060,10 +1130,10 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req)
struct nvme_dev *dev = nvmeq->dev;
if (blk_integrity_rq(req))
- nvme_unmap_metadata(dev, req);
+ nvme_unmap_metadata(dev, nvmeq, req);
if (blk_rq_nr_phys_segments(req))
- nvme_unmap_data(dev, req);
+ nvme_unmap_data(dev, nvmeq, req);
}
static void nvme_pci_complete_rq(struct request *req)
@@ -1490,7 +1560,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
* returned to the driver, or if this is the admin queue.
*/
opcode = nvme_req(req)->cmd->common.opcode;
- if (!nvmeq->qid || iod->aborted) {
+ if (!nvmeq->qid || (iod->flags & IOD_ABORTED)) {
dev_warn(dev->ctrl.device,
"I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
req->tag, nvme_cid(req), opcode,
@@ -1503,7 +1573,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
atomic_inc(&dev->ctrl.abort_limit);
return BLK_EH_RESET_TIMER;
}
- iod->aborted = true;
+ iod->flags |= IOD_ABORTED;
cmd.abort.opcode = nvme_admin_abort_cmd;
cmd.abort.cid = nvme_cid(req);
@@ -2842,35 +2912,6 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
return 0;
}
-static int nvme_setup_prp_pools(struct nvme_dev *dev)
-{
- size_t small_align = 256;
-
- dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
- NVME_CTRL_PAGE_SIZE,
- NVME_CTRL_PAGE_SIZE, 0);
- if (!dev->prp_page_pool)
- return -ENOMEM;
-
- if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
- small_align = 512;
-
- /* Optimisation for I/Os between 4k and 128k */
- dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
- 256, small_align, 0);
- if (!dev->prp_small_pool) {
- dma_pool_destroy(dev->prp_page_pool);
- return -ENOMEM;
- }
- return 0;
-}
-
-static void nvme_release_prp_pools(struct nvme_dev *dev)
-{
- dma_pool_destroy(dev->prp_page_pool);
- dma_pool_destroy(dev->prp_small_pool);
-}
-
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
@@ -3185,7 +3226,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
struct nvme_dev *dev;
int ret = -ENOMEM;
- dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
+ dev = kzalloc_node(struct_size(dev, descriptor_pools, nr_node_ids),
+ GFP_KERNEL, node);
if (!dev)
return ERR_PTR(-ENOMEM);
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
@@ -3260,13 +3302,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result)
goto out_uninit_ctrl;
- result = nvme_setup_prp_pools(dev);
- if (result)
- goto out_dev_unmap;
-
result = nvme_pci_alloc_iod_mempool(dev);
if (result)
- goto out_release_prp_pools;
+ goto out_dev_unmap;
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
@@ -3342,8 +3380,6 @@ out_disable:
out_release_iod_mempool:
mempool_destroy(dev->iod_mempool);
mempool_destroy(dev->iod_meta_mempool);
-out_release_prp_pools:
- nvme_release_prp_pools(dev);
out_dev_unmap:
nvme_dev_unmap(dev);
out_uninit_ctrl:
@@ -3408,7 +3444,7 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_free_queues(dev, 0);
mempool_destroy(dev->iod_mempool);
mempool_destroy(dev->iod_meta_mempool);
- nvme_release_prp_pools(dev);
+ nvme_release_descriptor_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
}
@@ -3809,9 +3845,7 @@ static int __init nvme_init(void)
BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
- BUILD_BUG_ON(NVME_MAX_SEGS > SGES_PER_PAGE);
- BUILD_BUG_ON(sizeof(struct scatterlist) * NVME_MAX_SEGS > PAGE_SIZE);
- BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS);
+ BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_DESCRIPTORS);
return pci_register_driver(&nvme_driver);
}
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index a5bc3bb483d5..29430949ce2f 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -260,6 +260,7 @@ static struct attribute *nvme_ns_attrs[] = {
&dev_attr_ana_state.attr,
&dev_attr_queue_depth.attr,
&dev_attr_numa_nodes.attr,
+ &dev_attr_delayed_removal_secs.attr,
#endif
&dev_attr_io_passthru_err_log_enabled.attr,
NULL,
@@ -296,6 +297,12 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
if (nvme_disk_is_ns_head(dev_to_disk(dev)))
return 0;
}
+ if (a == &dev_attr_delayed_removal_secs.attr) {
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!nvme_disk_is_ns_head(disk))
+ return 0;
+ }
#endif
return a->mode;
}
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index aba365f97cf6..853bc67d045c 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -403,7 +403,7 @@ static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
}
static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
- bool sync, bool last)
+ bool last)
{
struct nvme_tcp_queue *queue = req->queue;
bool empty;
@@ -417,7 +417,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
* are on the same cpu, so we don't introduce contention.
*/
if (queue->io_cpu == raw_smp_processor_id() &&
- sync && empty && mutex_trylock(&queue->send_mutex)) {
+ empty && mutex_trylock(&queue->send_mutex)) {
nvme_tcp_send_all(queue);
mutex_unlock(&queue->send_mutex);
}
@@ -770,7 +770,9 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
req->ttag = pdu->ttag;
nvme_tcp_setup_h2c_data_pdu(req);
- nvme_tcp_queue_request(req, false, true);
+
+ llist_add(&req->lentry, &queue->req_list);
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
return 0;
}
@@ -2385,7 +2387,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
if (ret)
return ret;
- if (ctrl->opts && ctrl->opts->concat && !ctrl->tls_pskid) {
+ if (ctrl->opts->concat && !ctrl->tls_pskid) {
/* See comments for nvme_tcp_key_revoke_needed() */
dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n");
nvme_stop_keep_alive(ctrl);
@@ -2637,7 +2639,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
- nvme_tcp_queue_request(&ctrl->async_req, true, true);
+ nvme_tcp_queue_request(&ctrl->async_req, true);
}
static void nvme_tcp_complete_timed_out(struct request *rq)
@@ -2789,7 +2791,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
nvme_start_request(rq);
- nvme_tcp_queue_request(req, true, bd->last);
+ nvme_tcp_queue_request(req, bd->last);
return BLK_STS_OK;
}
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index acc138bbf8f2..c7317299078d 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -63,14 +63,9 @@ static void nvmet_execute_create_sq(struct nvmet_req *req)
if (status != NVME_SC_SUCCESS)
goto complete;
- /*
- * Note: The NVMe specification allows multiple SQs to use the same CQ.
- * However, the target code does not really support that. So for now,
- * prevent this and fail the command if sqid and cqid are different.
- */
- if (!cqid || cqid != sqid) {
- pr_err("SQ %u: Unsupported CQID %u\n", sqid, cqid);
- status = NVME_SC_CQ_INVALID | NVME_STATUS_DNR;
+ status = nvmet_check_io_cqid(ctrl, cqid, false);
+ if (status != NVME_SC_SUCCESS) {
+ pr_err("SQ %u: Invalid CQID %u\n", sqid, cqid);
goto complete;
}
@@ -79,7 +74,7 @@ static void nvmet_execute_create_sq(struct nvmet_req *req)
goto complete;
}
- status = ctrl->ops->create_sq(ctrl, sqid, sq_flags, qsize, prp1);
+ status = ctrl->ops->create_sq(ctrl, sqid, cqid, sq_flags, qsize, prp1);
complete:
nvmet_req_complete(req, status);
@@ -96,14 +91,15 @@ static void nvmet_execute_delete_cq(struct nvmet_req *req)
goto complete;
}
- if (!cqid) {
- status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+ status = nvmet_check_io_cqid(ctrl, cqid, false);
+ if (status != NVME_SC_SUCCESS)
goto complete;
- }
- status = nvmet_check_cqid(ctrl, cqid);
- if (status != NVME_SC_SUCCESS)
+ if (!ctrl->cqs[cqid] || nvmet_cq_in_use(ctrl->cqs[cqid])) {
+ /* Some SQs are still using this CQ */
+ status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
goto complete;
+ }
status = ctrl->ops->delete_cq(ctrl, cqid);
@@ -127,12 +123,7 @@ static void nvmet_execute_create_cq(struct nvmet_req *req)
goto complete;
}
- if (!cqid) {
- status = NVME_SC_QID_INVALID | NVME_STATUS_DNR;
- goto complete;
- }
-
- status = nvmet_check_cqid(ctrl, cqid);
+ status = nvmet_check_io_cqid(ctrl, cqid, true);
if (status != NVME_SC_SUCCESS)
goto complete;
diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 9429b8218408..b340380f3892 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -280,9 +280,12 @@ void nvmet_destroy_auth(struct nvmet_ctrl *ctrl)
bool nvmet_check_auth_status(struct nvmet_req *req)
{
- if (req->sq->ctrl->host_key &&
- !req->sq->authenticated)
- return false;
+ if (req->sq->ctrl->host_key) {
+ if (req->sq->qid > 0)
+ return true;
+ if (!req->sq->authenticated)
+ return false;
+ }
return true;
}
@@ -290,7 +293,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
unsigned int shash_len)
{
struct crypto_shash *shash_tfm;
- struct shash_desc *shash;
+ SHASH_DESC_ON_STACK(shash, shash_tfm);
struct nvmet_ctrl *ctrl = req->sq->ctrl;
const char *hash_name;
u8 *challenge = req->sq->dhchap_c1;
@@ -342,19 +345,13 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
req->sq->dhchap_c1,
challenge, shash_len);
if (ret)
- goto out_free_challenge;
+ goto out;
}
pr_debug("ctrl %d qid %d host response seq %u transaction %d\n",
ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
req->sq->dhchap_tid);
- shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
- GFP_KERNEL);
- if (!shash) {
- ret = -ENOMEM;
- goto out_free_challenge;
- }
shash->tfm = shash_tfm;
ret = crypto_shash_init(shash);
if (ret)
@@ -389,8 +386,6 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
goto out;
ret = crypto_shash_final(shash, response);
out:
- kfree(shash);
-out_free_challenge:
if (challenge != req->sq->dhchap_c1)
kfree(challenge);
out_free_response:
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 245475c43127..db7b17d1094e 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -813,11 +813,43 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status)
}
EXPORT_SYMBOL_GPL(nvmet_req_complete);
+void nvmet_cq_init(struct nvmet_cq *cq)
+{
+ refcount_set(&cq->ref, 1);
+}
+EXPORT_SYMBOL_GPL(nvmet_cq_init);
+
+bool nvmet_cq_get(struct nvmet_cq *cq)
+{
+ return refcount_inc_not_zero(&cq->ref);
+}
+EXPORT_SYMBOL_GPL(nvmet_cq_get);
+
+void nvmet_cq_put(struct nvmet_cq *cq)
+{
+ if (refcount_dec_and_test(&cq->ref))
+ nvmet_cq_destroy(cq);
+}
+EXPORT_SYMBOL_GPL(nvmet_cq_put);
+
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
u16 qid, u16 size)
{
cq->qid = qid;
cq->size = size;
+
+ ctrl->cqs[qid] = cq;
+}
+
+void nvmet_cq_destroy(struct nvmet_cq *cq)
+{
+ struct nvmet_ctrl *ctrl = cq->ctrl;
+
+ if (ctrl) {
+ ctrl->cqs[cq->qid] = NULL;
+ nvmet_ctrl_put(cq->ctrl);
+ cq->ctrl = NULL;
+ }
}
void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
@@ -837,37 +869,47 @@ static void nvmet_confirm_sq(struct percpu_ref *ref)
complete(&sq->confirm_done);
}
-u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid)
+u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create)
{
- if (!ctrl->sqs)
+ if (!ctrl->cqs)
return NVME_SC_INTERNAL | NVME_STATUS_DNR;
if (cqid > ctrl->subsys->max_qid)
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
- /*
- * Note: For PCI controllers, the NVMe specifications allows multiple
- * SQs to share a single CQ. However, we do not support this yet, so
- * check that there is no SQ defined for a CQ. If one exist, then the
- * CQ ID is invalid for creation as well as when the CQ is being
- * deleted (as that would mean that the SQ was not deleted before the
- * CQ).
- */
- if (ctrl->sqs[cqid])
+ if ((create && ctrl->cqs[cqid]) || (!create && !ctrl->cqs[cqid]))
return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
return NVME_SC_SUCCESS;
}
+u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create)
+{
+ if (!cqid)
+ return NVME_SC_QID_INVALID | NVME_STATUS_DNR;
+ return nvmet_check_cqid(ctrl, cqid, create);
+}
+
+bool nvmet_cq_in_use(struct nvmet_cq *cq)
+{
+ return refcount_read(&cq->ref) > 1;
+}
+EXPORT_SYMBOL_GPL(nvmet_cq_in_use);
+
u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq,
u16 qid, u16 size)
{
u16 status;
- status = nvmet_check_cqid(ctrl, qid);
+ status = nvmet_check_cqid(ctrl, qid, true);
if (status != NVME_SC_SUCCESS)
return status;
+ if (!kref_get_unless_zero(&ctrl->ref))
+ return NVME_SC_INTERNAL | NVME_STATUS_DNR;
+ cq->ctrl = ctrl;
+
+ nvmet_cq_init(cq);
nvmet_cq_setup(ctrl, cq, qid, size);
return NVME_SC_SUCCESS;
@@ -891,7 +933,7 @@ u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid,
}
u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
- u16 sqid, u16 size)
+ struct nvmet_cq *cq, u16 sqid, u16 size)
{
u16 status;
int ret;
@@ -903,7 +945,7 @@ u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
if (status != NVME_SC_SUCCESS)
return status;
- ret = nvmet_sq_init(sq);
+ ret = nvmet_sq_init(sq, cq);
if (ret) {
status = NVME_SC_INTERNAL | NVME_STATUS_DNR;
goto ctrl_put;
@@ -935,6 +977,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq)
wait_for_completion(&sq->free_done);
percpu_ref_exit(&sq->ref);
nvmet_auth_sq_free(sq);
+ nvmet_cq_put(sq->cq);
/*
* we must reference the ctrl again after waiting for inflight IO
@@ -967,18 +1010,23 @@ static void nvmet_sq_free(struct percpu_ref *ref)
complete(&sq->free_done);
}
-int nvmet_sq_init(struct nvmet_sq *sq)
+int nvmet_sq_init(struct nvmet_sq *sq, struct nvmet_cq *cq)
{
int ret;
+ if (!nvmet_cq_get(cq))
+ return -EINVAL;
+
ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL);
if (ret) {
pr_err("percpu_ref init failed!\n");
+ nvmet_cq_put(cq);
return ret;
}
init_completion(&sq->free_done);
init_completion(&sq->confirm_done);
nvmet_auth_sq_init(sq);
+ sq->cq = cq;
return 0;
}
@@ -1108,13 +1156,13 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
return ret;
}
-bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
- struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops)
+bool nvmet_req_init(struct nvmet_req *req, struct nvmet_sq *sq,
+ const struct nvmet_fabrics_ops *ops)
{
u8 flags = req->cmd->common.flags;
u16 status;
- req->cq = cq;
+ req->cq = sq->cq;
req->sq = sq;
req->ops = ops;
req->sg = NULL;
@@ -1612,12 +1660,17 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
if (!ctrl->sqs)
goto out_free_changed_ns_list;
+ ctrl->cqs = kcalloc(subsys->max_qid + 1, sizeof(struct nvmet_cq *),
+ GFP_KERNEL);
+ if (!ctrl->cqs)
+ goto out_free_sqs;
+
ret = ida_alloc_range(&cntlid_ida,
subsys->cntlid_min, subsys->cntlid_max,
GFP_KERNEL);
if (ret < 0) {
args->status = NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
- goto out_free_sqs;
+ goto out_free_cqs;
}
ctrl->cntlid = ret;
@@ -1676,6 +1729,8 @@ init_pr_fail:
mutex_unlock(&subsys->lock);
nvmet_stop_keep_alive_timer(ctrl);
ida_free(&cntlid_ida, ctrl->cntlid);
+out_free_cqs:
+ kfree(ctrl->cqs);
out_free_sqs:
kfree(ctrl->sqs);
out_free_changed_ns_list:
@@ -1712,6 +1767,7 @@ static void nvmet_ctrl_free(struct kref *ref)
nvmet_async_events_free(ctrl);
kfree(ctrl->sqs);
+ kfree(ctrl->cqs);
kfree(ctrl->changed_ns_list);
kfree(ctrl);
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index df7207640506..c06f3e04296c 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -119,7 +119,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr,
memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE);
memcpy(e->traddr, traddr, NVMF_TRADDR_SIZE);
memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE);
- strncpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
+ strscpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
}
/*
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index f012bdf89850..7b8d8b397802 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -208,6 +208,14 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
return NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
}
+ kref_get(&ctrl->ref);
+ old = cmpxchg(&req->cq->ctrl, NULL, ctrl);
+ if (old) {
+ pr_warn("queue already connected!\n");
+ req->error_loc = offsetof(struct nvmf_connect_command, opcode);
+ return NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR;
+ }
+
/* note: convert queue size from 0's-based value to 1's-based value */
nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1);
nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1);
@@ -239,8 +247,8 @@ static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq)
bool needs_auth = nvmet_has_auth(ctrl, sq);
key_serial_t keyid = nvmet_queue_tls_keyid(sq);
- /* Do not authenticate I/O queues for secure concatenation */
- if (ctrl->concat && sq->qid)
+ /* Do not authenticate I/O queues */
+ if (sq->qid)
needs_auth = false;
if (keyid)
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 7b50130f10f6..254537b93e63 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -816,7 +816,8 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
nvmet_fc_prep_fcp_iodlist(assoc->tgtport, queue);
- ret = nvmet_sq_init(&queue->nvme_sq);
+ nvmet_cq_init(&queue->nvme_cq);
+ ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
if (ret)
goto out_fail_iodlist;
@@ -826,6 +827,7 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
return queue;
out_fail_iodlist:
+ nvmet_cq_put(&queue->nvme_cq);
nvmet_fc_destroy_fcp_iodlist(assoc->tgtport, queue);
destroy_workqueue(queue->work_q);
out_free_queue:
@@ -934,6 +936,7 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
flush_workqueue(queue->work_q);
nvmet_sq_destroy(&queue->nvme_sq);
+ nvmet_cq_put(&queue->nvme_cq);
nvmet_fc_tgt_q_put(queue);
}
@@ -1254,6 +1257,7 @@ nvmet_fc_portentry_bind(struct nvmet_fc_tgtport *tgtport,
{
lockdep_assert_held(&nvmet_fc_tgtlock);
+ nvmet_fc_tgtport_get(tgtport);
pe->tgtport = tgtport;
tgtport->pe = pe;
@@ -1273,8 +1277,10 @@ nvmet_fc_portentry_unbind(struct nvmet_fc_port_entry *pe)
unsigned long flags;
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
- if (pe->tgtport)
+ if (pe->tgtport) {
+ nvmet_fc_tgtport_put(pe->tgtport);
pe->tgtport->pe = NULL;
+ }
list_del(&pe->pe_list);
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
}
@@ -1292,8 +1298,10 @@ nvmet_fc_portentry_unbind_tgt(struct nvmet_fc_tgtport *tgtport)
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
pe = tgtport->pe;
- if (pe)
+ if (pe) {
+ nvmet_fc_tgtport_put(pe->tgtport);
pe->tgtport = NULL;
+ }
tgtport->pe = NULL;
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
}
@@ -1316,6 +1324,9 @@ nvmet_fc_portentry_rebind_tgt(struct nvmet_fc_tgtport *tgtport)
list_for_each_entry(pe, &nvmet_fc_portentry_list, pe_list) {
if (tgtport->fc_target_port.node_name == pe->node_name &&
tgtport->fc_target_port.port_name == pe->port_name) {
+ if (!nvmet_fc_tgtport_get(tgtport))
+ continue;
+
WARN_ON(pe->tgtport);
tgtport->pe = pe;
pe->tgtport = tgtport;
@@ -1580,6 +1591,39 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
}
+static void
+nvmet_fc_free_pending_reqs(struct nvmet_fc_tgtport *tgtport)
+{
+ struct nvmet_fc_ls_req_op *lsop;
+ struct nvmefc_ls_req *lsreq;
+ struct nvmet_fc_ls_iod *iod;
+ int i;
+
+ iod = tgtport->iod;
+ for (i = 0; i < NVMET_LS_CTX_COUNT; iod++, i++)
+ cancel_work(&iod->work);
+
+ /*
+ * After this point the connection is lost and thus any pending
+ * request can't be processed by the normal completion path. This
+ * is likely a request from nvmet_fc_send_ls_req_async.
+ */
+ while ((lsop = list_first_entry_or_null(&tgtport->ls_req_list,
+ struct nvmet_fc_ls_req_op, lsreq_list))) {
+ list_del(&lsop->lsreq_list);
+
+ if (!lsop->req_queued)
+ continue;
+
+ lsreq = &lsop->ls_req;
+ fc_dma_unmap_single(tgtport->dev, lsreq->rqstdma,
+ (lsreq->rqstlen + lsreq->rsplen),
+ DMA_BIDIRECTIONAL);
+ nvmet_fc_tgtport_put(tgtport);
+ kfree(lsop);
+ }
+}
+
/**
* nvmet_fc_unregister_targetport - transport entry point called by an
* LLDD to deregister/remove a previously
@@ -1608,13 +1652,7 @@ nvmet_fc_unregister_targetport(struct nvmet_fc_target_port *target_port)
flush_workqueue(nvmet_wq);
- /*
- * should terminate LS's as well. However, LS's will be generated
- * at the tail end of association termination, so they likely don't
- * exist yet. And even if they did, it's worthwhile to just let
- * them finish and targetport ref counting will clean things up.
- */
-
+ nvmet_fc_free_pending_reqs(tgtport);
nvmet_fc_tgtport_put(tgtport);
return 0;
@@ -2531,10 +2569,8 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
fod->data_sg = NULL;
fod->data_sg_cnt = 0;
- ret = nvmet_req_init(&fod->req,
- &fod->queue->nvme_cq,
- &fod->queue->nvme_sq,
- &nvmet_fc_tgt_fcp_ops);
+ ret = nvmet_req_init(&fod->req, &fod->queue->nvme_sq,
+ &nvmet_fc_tgt_fcp_ops);
if (!ret) {
/* bad SQE content or invalid ctrl state */
/* nvmet layer has already called op done to send rsp. */
@@ -2860,12 +2896,17 @@ nvmet_fc_add_port(struct nvmet_port *port)
list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) {
if ((tgtport->fc_target_port.node_name == traddr.nn) &&
(tgtport->fc_target_port.port_name == traddr.pn)) {
+ if (!nvmet_fc_tgtport_get(tgtport))
+ continue;
+
/* a FC port can only be 1 nvmet port id */
if (!tgtport->pe) {
nvmet_fc_portentry_bind(tgtport, pe, port);
ret = 0;
} else
ret = -EALREADY;
+
+ nvmet_fc_tgtport_put(tgtport);
break;
}
}
@@ -2881,11 +2922,21 @@ static void
nvmet_fc_remove_port(struct nvmet_port *port)
{
struct nvmet_fc_port_entry *pe = port->priv;
+ struct nvmet_fc_tgtport *tgtport = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
+ if (pe->tgtport && nvmet_fc_tgtport_get(pe->tgtport))
+ tgtport = pe->tgtport;
+ spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
nvmet_fc_portentry_unbind(pe);
- /* terminate any outstanding associations */
- __nvmet_fc_free_assocs(pe->tgtport);
+ if (tgtport) {
+ /* terminate any outstanding associations */
+ __nvmet_fc_free_assocs(tgtport);
+ nvmet_fc_tgtport_put(tgtport);
+ }
kfree(pe);
}
@@ -2894,10 +2945,21 @@ static void
nvmet_fc_discovery_chg(struct nvmet_port *port)
{
struct nvmet_fc_port_entry *pe = port->priv;
- struct nvmet_fc_tgtport *tgtport = pe->tgtport;
+ struct nvmet_fc_tgtport *tgtport = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
+ if (pe->tgtport && nvmet_fc_tgtport_get(pe->tgtport))
+ tgtport = pe->tgtport;
+ spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
+
+ if (!tgtport)
+ return;
if (tgtport && tgtport->ops->discovery_event)
tgtport->ops->discovery_event(&tgtport->fc_target_port);
+
+ nvmet_fc_tgtport_put(tgtport);
}
static ssize_t
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 641201e62c1b..257b497d515a 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -207,7 +207,6 @@ static LIST_HEAD(fcloop_nports);
struct fcloop_lport {
struct nvme_fc_local_port *localport;
struct list_head lport_list;
- struct completion unreg_done;
refcount_t ref;
};
@@ -215,6 +214,9 @@ struct fcloop_lport_priv {
struct fcloop_lport *lport;
};
+/* The port is already being removed, avoid double free */
+#define PORT_DELETED 0
+
struct fcloop_rport {
struct nvme_fc_remote_port *remoteport;
struct nvmet_fc_target_port *targetport;
@@ -223,6 +225,7 @@ struct fcloop_rport {
spinlock_t lock;
struct list_head ls_list;
struct work_struct ls_work;
+ unsigned long flags;
};
struct fcloop_tport {
@@ -233,6 +236,7 @@ struct fcloop_tport {
spinlock_t lock;
struct list_head ls_list;
struct work_struct ls_work;
+ unsigned long flags;
};
struct fcloop_nport {
@@ -288,6 +292,9 @@ struct fcloop_ini_fcpreq {
spinlock_t inilock;
};
+/* SLAB cache for fcloop_lsreq structures */
+static struct kmem_cache *lsreq_cache;
+
static inline struct fcloop_lsreq *
ls_rsp_to_lsreq(struct nvmefc_ls_rsp *lsrsp)
{
@@ -338,6 +345,7 @@ fcloop_rport_lsrqst_work(struct work_struct *work)
* callee may free memory containing tls_req.
* do not reference lsreq after this.
*/
+ kmem_cache_free(lsreq_cache, tls_req);
spin_lock(&rport->lock);
}
@@ -349,10 +357,13 @@ fcloop_h2t_ls_req(struct nvme_fc_local_port *localport,
struct nvme_fc_remote_port *remoteport,
struct nvmefc_ls_req *lsreq)
{
- struct fcloop_lsreq *tls_req = lsreq->private;
struct fcloop_rport *rport = remoteport->private;
+ struct fcloop_lsreq *tls_req;
int ret = 0;
+ tls_req = kmem_cache_alloc(lsreq_cache, GFP_KERNEL);
+ if (!tls_req)
+ return -ENOMEM;
tls_req->lsreq = lsreq;
INIT_LIST_HEAD(&tls_req->ls_list);
@@ -389,14 +400,17 @@ fcloop_h2t_xmt_ls_rsp(struct nvmet_fc_target_port *targetport,
lsrsp->done(lsrsp);
- if (remoteport) {
- rport = remoteport->private;
- spin_lock(&rport->lock);
- list_add_tail(&tls_req->ls_list, &rport->ls_list);
- spin_unlock(&rport->lock);
- queue_work(nvmet_wq, &rport->ls_work);
+ if (!remoteport) {
+ kmem_cache_free(lsreq_cache, tls_req);
+ return 0;
}
+ rport = remoteport->private;
+ spin_lock(&rport->lock);
+ list_add_tail(&tls_req->ls_list, &rport->ls_list);
+ spin_unlock(&rport->lock);
+ queue_work(nvmet_wq, &rport->ls_work);
+
return 0;
}
@@ -422,6 +436,7 @@ fcloop_tport_lsrqst_work(struct work_struct *work)
* callee may free memory containing tls_req.
* do not reference lsreq after this.
*/
+ kmem_cache_free(lsreq_cache, tls_req);
spin_lock(&tport->lock);
}
@@ -432,8 +447,8 @@ static int
fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle,
struct nvmefc_ls_req *lsreq)
{
- struct fcloop_lsreq *tls_req = lsreq->private;
struct fcloop_tport *tport = targetport->private;
+ struct fcloop_lsreq *tls_req;
int ret = 0;
/*
@@ -441,6 +456,10 @@ fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle,
* hosthandle ignored as fcloop currently is
* 1:1 tgtport vs remoteport
*/
+
+ tls_req = kmem_cache_alloc(lsreq_cache, GFP_KERNEL);
+ if (!tls_req)
+ return -ENOMEM;
tls_req->lsreq = lsreq;
INIT_LIST_HEAD(&tls_req->ls_list);
@@ -457,6 +476,9 @@ fcloop_t2h_ls_req(struct nvmet_fc_target_port *targetport, void *hosthandle,
ret = nvme_fc_rcv_ls_req(tport->remoteport, &tls_req->ls_rsp,
lsreq->rqstaddr, lsreq->rqstlen);
+ if (ret)
+ kmem_cache_free(lsreq_cache, tls_req);
+
return ret;
}
@@ -471,18 +493,30 @@ fcloop_t2h_xmt_ls_rsp(struct nvme_fc_local_port *localport,
struct nvmet_fc_target_port *targetport = rport->targetport;
struct fcloop_tport *tport;
+ if (!targetport) {
+ /*
+ * The target port is gone. The target doesn't expect any
+ * response anymore and the ->done call is not valid
+ * because the resources have been freed by
+ * nvmet_fc_free_pending_reqs.
+ *
+ * We end up here from delete association exchange:
+ * nvmet_fc_xmt_disconnect_assoc sends an async request.
+ */
+ kmem_cache_free(lsreq_cache, tls_req);
+ return 0;
+ }
+
memcpy(lsreq->rspaddr, lsrsp->rspbuf,
((lsreq->rsplen < lsrsp->rsplen) ?
lsreq->rsplen : lsrsp->rsplen));
lsrsp->done(lsrsp);
- if (targetport) {
- tport = targetport->private;
- spin_lock(&tport->lock);
- list_add_tail(&tls_req->ls_list, &tport->ls_list);
- spin_unlock(&tport->lock);
- queue_work(nvmet_wq, &tport->ls_work);
- }
+ tport = targetport->private;
+ spin_lock(&tport->lock);
+ list_add_tail(&tls_req->ls_list, &tport->ls_list);
+ spin_unlock(&tport->lock);
+ queue_work(nvmet_wq, &tport->ls_work);
return 0;
}
@@ -566,7 +600,8 @@ fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
}
/* release original io reference on tgt struct */
- fcloop_tfcp_req_put(tfcp_req);
+ if (tfcp_req)
+ fcloop_tfcp_req_put(tfcp_req);
}
static bool drop_fabric_opcode;
@@ -618,12 +653,13 @@ fcloop_fcp_recv_work(struct work_struct *work)
{
struct fcloop_fcpreq *tfcp_req =
container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
- struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+ struct nvmefc_fcp_req *fcpreq;
unsigned long flags;
int ret = 0;
bool aborted = false;
spin_lock_irqsave(&tfcp_req->reqlock, flags);
+ fcpreq = tfcp_req->fcpreq;
switch (tfcp_req->inistate) {
case INI_IO_START:
tfcp_req->inistate = INI_IO_ACTIVE;
@@ -638,16 +674,19 @@ fcloop_fcp_recv_work(struct work_struct *work)
}
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
- if (unlikely(aborted))
- ret = -ECANCELED;
- else {
- if (likely(!check_for_drop(tfcp_req)))
- ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
- &tfcp_req->tgt_fcp_req,
- fcpreq->cmdaddr, fcpreq->cmdlen);
- else
- pr_info("%s: dropped command ********\n", __func__);
+ if (unlikely(aborted)) {
+ /* the abort handler will call fcloop_call_host_done */
+ return;
+ }
+
+ if (unlikely(check_for_drop(tfcp_req))) {
+ pr_info("%s: dropped command ********\n", __func__);
+ return;
}
+
+ ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
+ &tfcp_req->tgt_fcp_req,
+ fcpreq->cmdaddr, fcpreq->cmdlen);
if (ret)
fcloop_call_host_done(fcpreq, tfcp_req, ret);
}
@@ -662,15 +701,17 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
unsigned long flags;
spin_lock_irqsave(&tfcp_req->reqlock, flags);
- fcpreq = tfcp_req->fcpreq;
switch (tfcp_req->inistate) {
case INI_IO_ABORTED:
+ fcpreq = tfcp_req->fcpreq;
+ tfcp_req->fcpreq = NULL;
break;
case INI_IO_COMPLETED:
completed = true;
break;
default:
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
+ fcloop_tfcp_req_put(tfcp_req);
WARN_ON(1);
return;
}
@@ -686,10 +727,6 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
&tfcp_req->tgt_fcp_req);
- spin_lock_irqsave(&tfcp_req->reqlock, flags);
- tfcp_req->fcpreq = NULL;
- spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
-
fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
/* call_host_done releases reference for abort downcall */
}
@@ -958,13 +995,16 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
spin_lock(&inireq->inilock);
tfcp_req = inireq->tfcp_req;
- if (tfcp_req)
- fcloop_tfcp_req_get(tfcp_req);
+ if (tfcp_req) {
+ if (!fcloop_tfcp_req_get(tfcp_req))
+ tfcp_req = NULL;
+ }
spin_unlock(&inireq->inilock);
- if (!tfcp_req)
+ if (!tfcp_req) {
/* abort has already been called */
- return;
+ goto out_host_done;
+ }
/* break initiator/target relationship for io */
spin_lock_irqsave(&tfcp_req->reqlock, flags);
@@ -979,7 +1019,7 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
default:
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
WARN_ON(1);
- return;
+ goto out_host_done;
}
spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
@@ -993,6 +1033,11 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
*/
fcloop_tfcp_req_put(tfcp_req);
}
+
+ return;
+
+out_host_done:
+ fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
}
static void
@@ -1019,9 +1064,18 @@ fcloop_lport_get(struct fcloop_lport *lport)
static void
fcloop_nport_put(struct fcloop_nport *nport)
{
+ unsigned long flags;
+
if (!refcount_dec_and_test(&nport->ref))
return;
+ spin_lock_irqsave(&fcloop_lock, flags);
+ list_del(&nport->nport_list);
+ spin_unlock_irqrestore(&fcloop_lock, flags);
+
+ if (nport->lport)
+ fcloop_lport_put(nport->lport);
+
kfree(nport);
}
@@ -1037,9 +1091,6 @@ fcloop_localport_delete(struct nvme_fc_local_port *localport)
struct fcloop_lport_priv *lport_priv = localport->private;
struct fcloop_lport *lport = lport_priv->lport;
- /* release any threads waiting for the unreg to complete */
- complete(&lport->unreg_done);
-
fcloop_lport_put(lport);
}
@@ -1047,18 +1098,38 @@ static void
fcloop_remoteport_delete(struct nvme_fc_remote_port *remoteport)
{
struct fcloop_rport *rport = remoteport->private;
+ bool put_port = false;
+ unsigned long flags;
flush_work(&rport->ls_work);
- fcloop_nport_put(rport->nport);
+
+ spin_lock_irqsave(&fcloop_lock, flags);
+ if (!test_and_set_bit(PORT_DELETED, &rport->flags))
+ put_port = true;
+ rport->nport->rport = NULL;
+ spin_unlock_irqrestore(&fcloop_lock, flags);
+
+ if (put_port)
+ fcloop_nport_put(rport->nport);
}
static void
fcloop_targetport_delete(struct nvmet_fc_target_port *targetport)
{
struct fcloop_tport *tport = targetport->private;
+ bool put_port = false;
+ unsigned long flags;
flush_work(&tport->ls_work);
- fcloop_nport_put(tport->nport);
+
+ spin_lock_irqsave(&fcloop_lock, flags);
+ if (!test_and_set_bit(PORT_DELETED, &tport->flags))
+ put_port = true;
+ tport->nport->tport = NULL;
+ spin_unlock_irqrestore(&fcloop_lock, flags);
+
+ if (put_port)
+ fcloop_nport_put(tport->nport);
}
#define FCLOOP_HW_QUEUES 4
@@ -1082,7 +1153,6 @@ static struct nvme_fc_port_template fctemplate = {
/* sizes of additional private data for data structures */
.local_priv_sz = sizeof(struct fcloop_lport_priv),
.remote_priv_sz = sizeof(struct fcloop_rport),
- .lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
.fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq),
};
@@ -1105,7 +1175,6 @@ static struct nvmet_fc_target_template tgttemplate = {
.target_features = 0,
/* sizes of additional private data for data structures */
.target_priv_sz = sizeof(struct fcloop_tport),
- .lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
};
static ssize_t
@@ -1170,51 +1239,92 @@ out_free_lport:
}
static int
-__wait_localport_unreg(struct fcloop_lport *lport)
+__localport_unreg(struct fcloop_lport *lport)
{
- int ret;
+ return nvme_fc_unregister_localport(lport->localport);
+}
- init_completion(&lport->unreg_done);
+static struct fcloop_nport *
+__fcloop_nport_lookup(u64 node_name, u64 port_name)
+{
+ struct fcloop_nport *nport;
- ret = nvme_fc_unregister_localport(lport->localport);
+ list_for_each_entry(nport, &fcloop_nports, nport_list) {
+ if (nport->node_name != node_name ||
+ nport->port_name != port_name)
+ continue;
- if (!ret)
- wait_for_completion(&lport->unreg_done);
+ if (fcloop_nport_get(nport))
+ return nport;
- return ret;
+ break;
+ }
+
+ return NULL;
}
+static struct fcloop_nport *
+fcloop_nport_lookup(u64 node_name, u64 port_name)
+{
+ struct fcloop_nport *nport;
+ unsigned long flags;
+
+ spin_lock_irqsave(&fcloop_lock, flags);
+ nport = __fcloop_nport_lookup(node_name, port_name);
+ spin_unlock_irqrestore(&fcloop_lock, flags);
+
+ return nport;
+}
+
+static struct fcloop_lport *
+__fcloop_lport_lookup(u64 node_name, u64 port_name)
+{
+ struct fcloop_lport *lport;
+
+ list_for_each_entry(lport, &fcloop_lports, lport_list) {
+ if (lport->localport->node_name != node_name ||
+ lport->localport->port_name != port_name)
+ continue;
+
+ if (fcloop_lport_get(lport))
+ return lport;
+
+ break;
+ }
+
+ return NULL;
+}
+
+static struct fcloop_lport *
+fcloop_lport_lookup(u64 node_name, u64 port_name)
+{
+ struct fcloop_lport *lport;
+ unsigned long flags;
+
+ spin_lock_irqsave(&fcloop_lock, flags);
+ lport = __fcloop_lport_lookup(node_name, port_name);
+ spin_unlock_irqrestore(&fcloop_lock, flags);
+
+ return lport;
+}
static ssize_t
fcloop_delete_local_port(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
- struct fcloop_lport *tlport, *lport = NULL;
+ struct fcloop_lport *lport;
u64 nodename, portname;
- unsigned long flags;
int ret;
ret = fcloop_parse_nm_options(dev, &nodename, &portname, buf);
if (ret)
return ret;
- spin_lock_irqsave(&fcloop_lock, flags);
-
- list_for_each_entry(tlport, &fcloop_lports, lport_list) {
- if (tlport->localport->node_name == nodename &&
- tlport->localport->port_name == portname) {
- if (!fcloop_lport_get(tlport))
- break;
- lport = tlport;
- break;
- }
- }
- spin_unlock_irqrestore(&fcloop_lock, flags);
-
+ lport = fcloop_lport_lookup(nodename, portname);
if (!lport)
return -ENOENT;
- ret = __wait_localport_unreg(lport);
+ ret = __localport_unreg(lport);
fcloop_lport_put(lport);
return ret ? ret : count;
@@ -1223,8 +1333,8 @@ fcloop_delete_local_port(struct device *dev, struct device_attribute *attr,
static struct fcloop_nport *
fcloop_alloc_nport(const char *buf, size_t count, bool remoteport)
{
- struct fcloop_nport *newnport, *nport = NULL;
- struct fcloop_lport *tmplport, *lport = NULL;
+ struct fcloop_nport *newnport, *nport;
+ struct fcloop_lport *lport;
struct fcloop_ctrl_options *opts;
unsigned long flags;
u32 opts_mask = (remoteport) ? RPORT_OPTS : TGTPORT_OPTS;
@@ -1239,10 +1349,8 @@ fcloop_alloc_nport(const char *buf, size_t count, bool remoteport)
goto out_free_opts;
/* everything there ? */
- if ((opts->mask & opts_mask) != opts_mask) {
- ret = -EINVAL;
+ if ((opts->mask & opts_mask) != opts_mask)
goto out_free_opts;
- }
newnport = kzalloc(sizeof(*newnport), GFP_KERNEL);
if (!newnport)
@@ -1258,60 +1366,61 @@ fcloop_alloc_nport(const char *buf, size_t count, bool remoteport)
refcount_set(&newnport->ref, 1);
spin_lock_irqsave(&fcloop_lock, flags);
-
- list_for_each_entry(tmplport, &fcloop_lports, lport_list) {
- if (tmplport->localport->node_name == opts->wwnn &&
- tmplport->localport->port_name == opts->wwpn)
- goto out_invalid_opts;
-
- if (tmplport->localport->node_name == opts->lpwwnn &&
- tmplport->localport->port_name == opts->lpwwpn)
- lport = tmplport;
+ lport = __fcloop_lport_lookup(opts->wwnn, opts->wwpn);
+ if (lport) {
+ /* invalid configuration */
+ fcloop_lport_put(lport);
+ goto out_free_newnport;
}
if (remoteport) {
- if (!lport)
- goto out_invalid_opts;
- newnport->lport = lport;
- }
-
- list_for_each_entry(nport, &fcloop_nports, nport_list) {
- if (nport->node_name == opts->wwnn &&
- nport->port_name == opts->wwpn) {
- if ((remoteport && nport->rport) ||
- (!remoteport && nport->tport)) {
- nport = NULL;
- goto out_invalid_opts;
- }
-
- fcloop_nport_get(nport);
-
- spin_unlock_irqrestore(&fcloop_lock, flags);
-
- if (remoteport)
- nport->lport = lport;
- if (opts->mask & NVMF_OPT_ROLES)
- nport->port_role = opts->roles;
- if (opts->mask & NVMF_OPT_FCADDR)
- nport->port_id = opts->fcaddr;
+ lport = __fcloop_lport_lookup(opts->lpwwnn, opts->lpwwpn);
+ if (!lport) {
+ /* invalid configuration */
goto out_free_newnport;
}
}
- list_add_tail(&newnport->nport_list, &fcloop_nports);
+ nport = __fcloop_nport_lookup(opts->wwnn, opts->wwpn);
+ if (nport) {
+ if ((remoteport && nport->rport) ||
+ (!remoteport && nport->tport)) {
+ /* invalid configuration */
+ goto out_put_nport;
+ }
+
+ /* found existing nport, discard the new nport */
+ kfree(newnport);
+ } else {
+ list_add_tail(&newnport->nport_list, &fcloop_nports);
+ nport = newnport;
+ }
+ if (opts->mask & NVMF_OPT_ROLES)
+ nport->port_role = opts->roles;
+ if (opts->mask & NVMF_OPT_FCADDR)
+ nport->port_id = opts->fcaddr;
+ if (lport) {
+ if (!nport->lport)
+ nport->lport = lport;
+ else
+ fcloop_lport_put(lport);
+ }
spin_unlock_irqrestore(&fcloop_lock, flags);
kfree(opts);
- return newnport;
+ return nport;
-out_invalid_opts:
- spin_unlock_irqrestore(&fcloop_lock, flags);
+out_put_nport:
+ if (lport)
+ fcloop_lport_put(lport);
+ fcloop_nport_put(nport);
out_free_newnport:
+ spin_unlock_irqrestore(&fcloop_lock, flags);
kfree(newnport);
out_free_opts:
kfree(opts);
- return nport;
+ return NULL;
}
static ssize_t
@@ -1352,6 +1461,7 @@ fcloop_create_remote_port(struct device *dev, struct device_attribute *attr,
rport->nport = nport;
rport->lport = nport->lport;
nport->rport = rport;
+ rport->flags = 0;
spin_lock_init(&rport->lock);
INIT_WORK(&rport->ls_work, fcloop_rport_lsrqst_work);
INIT_LIST_HEAD(&rport->ls_list);
@@ -1365,21 +1475,18 @@ __unlink_remote_port(struct fcloop_nport *nport)
{
struct fcloop_rport *rport = nport->rport;
+ lockdep_assert_held(&fcloop_lock);
+
if (rport && nport->tport)
nport->tport->remoteport = NULL;
nport->rport = NULL;
- list_del(&nport->nport_list);
-
return rport;
}
static int
__remoteport_unreg(struct fcloop_nport *nport, struct fcloop_rport *rport)
{
- if (!rport)
- return -EALREADY;
-
return nvme_fc_unregister_remoteport(rport->remoteport);
}
@@ -1387,8 +1494,8 @@ static ssize_t
fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
- struct fcloop_nport *nport = NULL, *tmpport;
- static struct fcloop_rport *rport;
+ struct fcloop_nport *nport;
+ struct fcloop_rport *rport;
u64 nodename, portname;
unsigned long flags;
int ret;
@@ -1397,24 +1504,24 @@ fcloop_delete_remote_port(struct device *dev, struct device_attribute *attr,
if (ret)
return ret;
- spin_lock_irqsave(&fcloop_lock, flags);
-
- list_for_each_entry(tmpport, &fcloop_nports, nport_list) {
- if (tmpport->node_name == nodename &&
- tmpport->port_name == portname && tmpport->rport) {
- nport = tmpport;
- rport = __unlink_remote_port(nport);
- break;
- }
- }
+ nport = fcloop_nport_lookup(nodename, portname);
+ if (!nport)
+ return -ENOENT;
+ spin_lock_irqsave(&fcloop_lock, flags);
+ rport = __unlink_remote_port(nport);
spin_unlock_irqrestore(&fcloop_lock, flags);
- if (!nport)
- return -ENOENT;
+ if (!rport) {
+ ret = -ENOENT;
+ goto out_nport_put;
+ }
ret = __remoteport_unreg(nport, rport);
+out_nport_put:
+ fcloop_nport_put(nport);
+
return ret ? ret : count;
}
@@ -1452,6 +1559,7 @@ fcloop_create_target_port(struct device *dev, struct device_attribute *attr,
tport->nport = nport;
tport->lport = nport->lport;
nport->tport = tport;
+ tport->flags = 0;
spin_lock_init(&tport->lock);
INIT_WORK(&tport->ls_work, fcloop_tport_lsrqst_work);
INIT_LIST_HEAD(&tport->ls_list);
@@ -1465,6 +1573,8 @@ __unlink_target_port(struct fcloop_nport *nport)
{
struct fcloop_tport *tport = nport->tport;
+ lockdep_assert_held(&fcloop_lock);
+
if (tport && nport->rport)
nport->rport->targetport = NULL;
nport->tport = NULL;
@@ -1475,9 +1585,6 @@ __unlink_target_port(struct fcloop_nport *nport)
static int
__targetport_unreg(struct fcloop_nport *nport, struct fcloop_tport *tport)
{
- if (!tport)
- return -EALREADY;
-
return nvmet_fc_unregister_targetport(tport->targetport);
}
@@ -1485,8 +1592,8 @@ static ssize_t
fcloop_delete_target_port(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
- struct fcloop_nport *nport = NULL, *tmpport;
- struct fcloop_tport *tport = NULL;
+ struct fcloop_nport *nport;
+ struct fcloop_tport *tport;
u64 nodename, portname;
unsigned long flags;
int ret;
@@ -1495,24 +1602,24 @@ fcloop_delete_target_port(struct device *dev, struct device_attribute *attr,
if (ret)
return ret;
- spin_lock_irqsave(&fcloop_lock, flags);
-
- list_for_each_entry(tmpport, &fcloop_nports, nport_list) {
- if (tmpport->node_name == nodename &&
- tmpport->port_name == portname && tmpport->tport) {
- nport = tmpport;
- tport = __unlink_target_port(nport);
- break;
- }
- }
+ nport = fcloop_nport_lookup(nodename, portname);
+ if (!nport)
+ return -ENOENT;
+ spin_lock_irqsave(&fcloop_lock, flags);
+ tport = __unlink_target_port(nport);
spin_unlock_irqrestore(&fcloop_lock, flags);
- if (!nport)
- return -ENOENT;
+ if (!tport) {
+ ret = -ENOENT;
+ goto out_nport_put;
+ }
ret = __targetport_unreg(nport, tport);
+out_nport_put:
+ fcloop_nport_put(nport);
+
return ret ? ret : count;
}
@@ -1578,15 +1685,20 @@ static const struct class fcloop_class = {
};
static struct device *fcloop_device;
-
static int __init fcloop_init(void)
{
int ret;
+ lsreq_cache = kmem_cache_create("lsreq_cache",
+ sizeof(struct fcloop_lsreq), 0,
+ 0, NULL);
+ if (!lsreq_cache)
+ return -ENOMEM;
+
ret = class_register(&fcloop_class);
if (ret) {
pr_err("couldn't register class fcloop\n");
- return ret;
+ goto out_destroy_cache;
}
fcloop_device = device_create_with_groups(
@@ -1604,13 +1716,15 @@ static int __init fcloop_init(void)
out_destroy_class:
class_unregister(&fcloop_class);
+out_destroy_cache:
+ kmem_cache_destroy(lsreq_cache);
return ret;
}
static void __exit fcloop_exit(void)
{
- struct fcloop_lport *lport = NULL;
- struct fcloop_nport *nport = NULL;
+ struct fcloop_lport *lport;
+ struct fcloop_nport *nport;
struct fcloop_tport *tport;
struct fcloop_rport *rport;
unsigned long flags;
@@ -1621,7 +1735,7 @@ static void __exit fcloop_exit(void)
for (;;) {
nport = list_first_entry_or_null(&fcloop_nports,
typeof(*nport), nport_list);
- if (!nport)
+ if (!nport || !fcloop_nport_get(nport))
break;
tport = __unlink_target_port(nport);
@@ -1629,13 +1743,21 @@ static void __exit fcloop_exit(void)
spin_unlock_irqrestore(&fcloop_lock, flags);
- ret = __targetport_unreg(nport, tport);
- if (ret)
- pr_warn("%s: Failed deleting target port\n", __func__);
+ if (tport) {
+ ret = __targetport_unreg(nport, tport);
+ if (ret)
+ pr_warn("%s: Failed deleting target port\n",
+ __func__);
+ }
- ret = __remoteport_unreg(nport, rport);
- if (ret)
- pr_warn("%s: Failed deleting remote port\n", __func__);
+ if (rport) {
+ ret = __remoteport_unreg(nport, rport);
+ if (ret)
+ pr_warn("%s: Failed deleting remote port\n",
+ __func__);
+ }
+
+ fcloop_nport_put(nport);
spin_lock_irqsave(&fcloop_lock, flags);
}
@@ -1648,7 +1770,7 @@ static void __exit fcloop_exit(void)
spin_unlock_irqrestore(&fcloop_lock, flags);
- ret = __wait_localport_unreg(lport);
+ ret = __localport_unreg(lport);
if (ret)
pr_warn("%s: Failed deleting local port\n", __func__);
@@ -1663,6 +1785,7 @@ static void __exit fcloop_exit(void)
device_destroy(&fcloop_class, MKDEV(0, 0));
class_unregister(&fcloop_class);
+ kmem_cache_destroy(lsreq_cache);
}
module_init(fcloop_init);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index a5c41144667c..f85a8441bcc6 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -33,10 +33,12 @@ struct nvme_loop_ctrl {
struct list_head list;
struct blk_mq_tag_set tag_set;
- struct nvme_loop_iod async_event_iod;
struct nvme_ctrl ctrl;
struct nvmet_port *port;
+
+ /* Must be last --ends in a flexible-array member. */
+ struct nvme_loop_iod async_event_iod;
};
static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl)
@@ -148,8 +150,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
nvme_start_request(req);
iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
iod->req.port = queue->ctrl->port;
- if (!nvmet_req_init(&iod->req, &queue->nvme_cq,
- &queue->nvme_sq, &nvme_loop_ops))
+ if (!nvmet_req_init(&iod->req, &queue->nvme_sq, &nvme_loop_ops))
return BLK_STS_OK;
if (blk_rq_nr_phys_segments(req)) {
@@ -181,8 +182,7 @@ static void nvme_loop_submit_async_event(struct nvme_ctrl *arg)
iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
- if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq,
- &nvme_loop_ops)) {
+ if (!nvmet_req_init(&iod->req, &queue->nvme_sq, &nvme_loop_ops)) {
dev_err(ctrl->ctrl.device, "failed async event work\n");
return;
}
@@ -273,6 +273,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
nvme_unquiesce_admin_queue(&ctrl->ctrl);
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
+ nvmet_cq_put(&ctrl->queues[0].nvme_cq);
nvme_remove_admin_tag_set(&ctrl->ctrl);
}
@@ -302,6 +303,7 @@ static void nvme_loop_destroy_io_queues(struct nvme_loop_ctrl *ctrl)
for (i = 1; i < ctrl->ctrl.queue_count; i++) {
clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags);
nvmet_sq_destroy(&ctrl->queues[i].nvme_sq);
+ nvmet_cq_put(&ctrl->queues[i].nvme_cq);
}
ctrl->ctrl.queue_count = 1;
/*
@@ -327,9 +329,13 @@ static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl)
for (i = 1; i <= nr_io_queues; i++) {
ctrl->queues[i].ctrl = ctrl;
- ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq);
- if (ret)
+ nvmet_cq_init(&ctrl->queues[i].nvme_cq);
+ ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq,
+ &ctrl->queues[i].nvme_cq);
+ if (ret) {
+ nvmet_cq_put(&ctrl->queues[i].nvme_cq);
goto out_destroy_queues;
+ }
ctrl->ctrl.queue_count++;
}
@@ -360,9 +366,13 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
int error;
ctrl->queues[0].ctrl = ctrl;
- error = nvmet_sq_init(&ctrl->queues[0].nvme_sq);
- if (error)
+ nvmet_cq_init(&ctrl->queues[0].nvme_cq);
+ error = nvmet_sq_init(&ctrl->queues[0].nvme_sq,
+ &ctrl->queues[0].nvme_cq);
+ if (error) {
+ nvmet_cq_put(&ctrl->queues[0].nvme_cq);
return error;
+ }
ctrl->ctrl.queue_count = 1;
error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
@@ -401,6 +411,7 @@ out_cleanup_tagset:
nvme_remove_admin_tag_set(&ctrl->ctrl);
out_free_sq:
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
+ nvmet_cq_put(&ctrl->queues[0].nvme_cq);
return error;
}
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index b6db8b74dc4a..df69a9dee71c 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -141,13 +141,16 @@ static inline struct device *nvmet_ns_dev(struct nvmet_ns *ns)
}
struct nvmet_cq {
+ struct nvmet_ctrl *ctrl;
u16 qid;
u16 size;
+ refcount_t ref;
};
struct nvmet_sq {
struct nvmet_ctrl *ctrl;
struct percpu_ref ref;
+ struct nvmet_cq *cq;
u16 qid;
u16 size;
u32 sqhd;
@@ -247,6 +250,7 @@ struct nvmet_pr_log_mgr {
struct nvmet_ctrl {
struct nvmet_subsys *subsys;
struct nvmet_sq **sqs;
+ struct nvmet_cq **cqs;
void *drvdata;
@@ -424,7 +428,7 @@ struct nvmet_fabrics_ops {
u16 (*get_max_queue_size)(const struct nvmet_ctrl *ctrl);
/* Operations mandatory for PCI target controllers */
- u16 (*create_sq)(struct nvmet_ctrl *ctrl, u16 sqid, u16 flags,
+ u16 (*create_sq)(struct nvmet_ctrl *ctrl, u16 sqid, u16 cqid, u16 flags,
u16 qsize, u64 prp1);
u16 (*delete_sq)(struct nvmet_ctrl *ctrl, u16 sqid);
u16 (*create_cq)(struct nvmet_ctrl *ctrl, u16 cqid, u16 flags,
@@ -557,8 +561,8 @@ u32 nvmet_fabrics_admin_cmd_data_len(struct nvmet_req *req);
u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req);
u32 nvmet_fabrics_io_cmd_data_len(struct nvmet_req *req);
-bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
- struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops);
+bool nvmet_req_init(struct nvmet_req *req, struct nvmet_sq *sq,
+ const struct nvmet_fabrics_ops *ops);
void nvmet_req_uninit(struct nvmet_req *req);
size_t nvmet_req_transfer_len(struct nvmet_req *req);
bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len);
@@ -571,18 +575,24 @@ void nvmet_execute_set_features(struct nvmet_req *req);
void nvmet_execute_get_features(struct nvmet_req *req);
void nvmet_execute_keep_alive(struct nvmet_req *req);
-u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid);
+u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create);
+u16 nvmet_check_io_cqid(struct nvmet_ctrl *ctrl, u16 cqid, bool create);
+void nvmet_cq_init(struct nvmet_cq *cq);
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
u16 size);
u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
u16 size);
+void nvmet_cq_destroy(struct nvmet_cq *cq);
+bool nvmet_cq_get(struct nvmet_cq *cq);
+void nvmet_cq_put(struct nvmet_cq *cq);
+bool nvmet_cq_in_use(struct nvmet_cq *cq);
u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid, bool create);
void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid,
u16 size);
-u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid,
- u16 size);
+u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq,
+ struct nvmet_cq *cq, u16 qid, u16 size);
void nvmet_sq_destroy(struct nvmet_sq *sq);
-int nvmet_sq_init(struct nvmet_sq *sq);
+int nvmet_sq_init(struct nvmet_sq *sq, struct nvmet_cq *cq);
void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl);
diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c
index 7123c855b5a6..a4295a5b8d28 100644
--- a/drivers/nvme/target/pci-epf.c
+++ b/drivers/nvme/target/pci-epf.c
@@ -1354,15 +1354,17 @@ static u16 nvmet_pci_epf_delete_cq(struct nvmet_ctrl *tctrl, u16 cqid)
if (test_and_clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags))
nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector);
nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &cq->pci_map);
+ nvmet_cq_put(&cq->nvme_cq);
return NVME_SC_SUCCESS;
}
static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
- u16 sqid, u16 flags, u16 qsize, u64 pci_addr)
+ u16 sqid, u16 cqid, u16 flags, u16 qsize, u64 pci_addr)
{
struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata;
struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid];
+ struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid];
u16 status;
if (test_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags))
@@ -1385,7 +1387,8 @@ static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl,
sq->qes = ctrl->io_sqes;
sq->pci_size = sq->qes * sq->depth;
- status = nvmet_sq_create(tctrl, &sq->nvme_sq, sqid, sq->depth);
+ status = nvmet_sq_create(tctrl, &sq->nvme_sq, &cq->nvme_cq, sqid,
+ sq->depth);
if (status != NVME_SC_SUCCESS)
return status;
@@ -1601,8 +1604,7 @@ static void nvmet_pci_epf_exec_iod_work(struct work_struct *work)
goto complete;
}
- if (!nvmet_req_init(req, &iod->cq->nvme_cq, &iod->sq->nvme_sq,
- &nvmet_pci_epf_fabrics_ops))
+ if (!nvmet_req_init(req, &iod->sq->nvme_sq, &nvmet_pci_epf_fabrics_ops))
goto complete;
iod->data_len = nvmet_req_transfer_len(req);
@@ -1879,8 +1881,8 @@ static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl)
qsize = aqa & 0x00000fff;
pci_addr = asq & GENMASK_ULL(63, 12);
- status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, NVME_QUEUE_PHYS_CONTIG,
- qsize, pci_addr);
+ status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, 0,
+ NVME_QUEUE_PHYS_CONTIG, qsize, pci_addr);
if (status != NVME_SC_SUCCESS) {
dev_err(ctrl->dev, "Failed to create admin submission queue\n");
nvmet_pci_epf_delete_cq(ctrl->tctrl, 0);
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 2a4536ef6184..432bdf7cd49e 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -976,8 +976,7 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
cmd->send_sge.addr, cmd->send_sge.length,
DMA_TO_DEVICE);
- if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
- &queue->nvme_sq, &nvmet_rdma_ops))
+ if (!nvmet_req_init(&cmd->req, &queue->nvme_sq, &nvmet_rdma_ops))
return;
status = nvmet_rdma_map_sgl(cmd);
@@ -1353,6 +1352,7 @@ static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
pr_debug("freeing queue %d\n", queue->idx);
nvmet_sq_destroy(&queue->nvme_sq);
+ nvmet_cq_put(&queue->nvme_cq);
nvmet_rdma_destroy_queue_ib(queue);
if (!queue->nsrq) {
@@ -1436,7 +1436,8 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
goto out_reject;
}
- ret = nvmet_sq_init(&queue->nvme_sq);
+ nvmet_cq_init(&queue->nvme_cq);
+ ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
if (ret) {
ret = NVME_RDMA_CM_NO_RSC;
goto out_free_queue;
@@ -1517,6 +1518,7 @@ out_ida_remove:
out_destroy_sq:
nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
+ nvmet_cq_put(&queue->nvme_cq);
kfree(queue);
out_reject:
nvmet_rdma_cm_reject(cm_id, ret);
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 12a5cb8641ca..c6603bd9c95e 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -7,6 +7,7 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/crc32c.h>
#include <linux/err.h>
#include <linux/nvme-tcp.h>
#include <linux/nvme-keyring.h>
@@ -17,7 +18,6 @@
#include <net/handshake.h>
#include <linux/inet.h>
#include <linux/llist.h>
-#include <crypto/hash.h>
#include <trace/events/sock.h>
#include "nvmet.h"
@@ -172,8 +172,6 @@ struct nvmet_tcp_queue {
/* digest state */
bool hdr_digest;
bool data_digest;
- struct ahash_request *snd_hash;
- struct ahash_request *rcv_hash;
/* TLS state */
key_serial_t tls_pskid;
@@ -294,14 +292,9 @@ static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
}
-static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
- void *pdu, size_t len)
+static inline void nvmet_tcp_hdgst(void *pdu, size_t len)
{
- struct scatterlist sg;
-
- sg_init_one(&sg, pdu, len);
- ahash_request_set_crypt(hash, &sg, pdu + len, len);
- crypto_ahash_digest(hash);
+ put_unaligned_le32(~crc32c(~0, pdu, len), pdu + len);
}
static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
@@ -318,7 +311,7 @@ static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
}
recv_digest = *(__le32 *)(pdu + hdr->hlen);
- nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
+ nvmet_tcp_hdgst(pdu, len);
exp_digest = *(__le32 *)(pdu + hdr->hlen);
if (recv_digest != exp_digest) {
pr_err("queue %d: header digest error: recv %#x expected %#x\n",
@@ -441,12 +434,24 @@ err:
return NVME_SC_INTERNAL;
}
-static void nvmet_tcp_calc_ddgst(struct ahash_request *hash,
- struct nvmet_tcp_cmd *cmd)
+static void nvmet_tcp_calc_ddgst(struct nvmet_tcp_cmd *cmd)
{
- ahash_request_set_crypt(hash, cmd->req.sg,
- (void *)&cmd->exp_ddgst, cmd->req.transfer_len);
- crypto_ahash_digest(hash);
+ size_t total_len = cmd->req.transfer_len;
+ struct scatterlist *sg = cmd->req.sg;
+ u32 crc = ~0;
+
+ while (total_len) {
+ size_t len = min_t(size_t, total_len, sg->length);
+
+ /*
+ * Note that the scatterlist does not contain any highmem pages,
+ * as it was allocated by sgl_alloc() with GFP_KERNEL.
+ */
+ crc = crc32c(crc, sg_virt(sg), len);
+ total_len -= len;
+ sg = sg_next(sg);
+ }
+ cmd->exp_ddgst = cpu_to_le32(~crc);
}
static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
@@ -473,19 +478,18 @@ static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
if (queue->data_digest) {
pdu->hdr.flags |= NVME_TCP_F_DDGST;
- nvmet_tcp_calc_ddgst(queue->snd_hash, cmd);
+ nvmet_tcp_calc_ddgst(cmd);
}
if (cmd->queue->hdr_digest) {
pdu->hdr.flags |= NVME_TCP_F_HDGST;
- nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+ nvmet_tcp_hdgst(pdu, sizeof(*pdu));
}
}
static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
{
struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
- struct nvmet_tcp_queue *queue = cmd->queue;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
cmd->offset = 0;
@@ -503,14 +507,13 @@ static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
if (cmd->queue->hdr_digest) {
pdu->hdr.flags |= NVME_TCP_F_HDGST;
- nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+ nvmet_tcp_hdgst(pdu, sizeof(*pdu));
}
}
static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
{
struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
- struct nvmet_tcp_queue *queue = cmd->queue;
u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
cmd->offset = 0;
@@ -523,7 +526,7 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
if (cmd->queue->hdr_digest) {
pdu->hdr.flags |= NVME_TCP_F_HDGST;
- nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+ nvmet_tcp_hdgst(pdu, sizeof(*pdu));
}
}
@@ -857,42 +860,6 @@ static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
smp_store_release(&queue->rcv_state, NVMET_TCP_RECV_PDU);
}
-static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
-{
- struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
-
- ahash_request_free(queue->rcv_hash);
- ahash_request_free(queue->snd_hash);
- crypto_free_ahash(tfm);
-}
-
-static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
-{
- struct crypto_ahash *tfm;
-
- tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
-
- queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
- if (!queue->snd_hash)
- goto free_tfm;
- ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
-
- queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
- if (!queue->rcv_hash)
- goto free_snd_hash;
- ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
-
- return 0;
-free_snd_hash:
- ahash_request_free(queue->snd_hash);
-free_tfm:
- crypto_free_ahash(tfm);
- return -ENOMEM;
-}
-
-
static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
{
struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
@@ -921,11 +888,6 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
- if (queue->hdr_digest || queue->data_digest) {
- ret = nvmet_tcp_alloc_crypto(queue);
- if (ret)
- return ret;
- }
memset(icresp, 0, sizeof(*icresp));
icresp->hdr.type = nvme_tcp_icresp;
@@ -1077,8 +1039,7 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
req = &queue->cmd->req;
memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
- if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
- &queue->nvme_sq, &nvmet_tcp_ops))) {
+ if (unlikely(!nvmet_req_init(req, &queue->nvme_sq, &nvmet_tcp_ops))) {
pr_err("failed cmd %p id %d opcode %d, data_len: %d, status: %04x\n",
req->cmd, req->cmd->common.command_id,
req->cmd->common.opcode,
@@ -1247,7 +1208,7 @@ static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
{
struct nvmet_tcp_queue *queue = cmd->queue;
- nvmet_tcp_calc_ddgst(queue->rcv_hash, cmd);
+ nvmet_tcp_calc_ddgst(cmd);
queue->offset = 0;
queue->left = NVME_TCP_DIGEST_LENGTH;
queue->rcv_state = NVMET_TCP_RECV_DDGST;
@@ -1615,13 +1576,12 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
nvmet_sq_put_tls_key(&queue->nvme_sq);
nvmet_tcp_uninit_data_in_cmds(queue);
nvmet_sq_destroy(&queue->nvme_sq);
+ nvmet_cq_put(&queue->nvme_cq);
cancel_work_sync(&queue->io_work);
nvmet_tcp_free_cmd_data_in_buffers(queue);
/* ->sock will be released by fput() */
fput(queue->sock->file);
nvmet_tcp_free_cmds(queue);
- if (queue->hdr_digest || queue->data_digest)
- nvmet_tcp_free_crypto(queue);
ida_free(&nvmet_tcp_queue_ida, queue->idx);
page_frag_cache_drain(&queue->pf_cache);
kfree(queue);
@@ -1950,7 +1910,8 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
if (ret)
goto out_ida_remove;
- ret = nvmet_sq_init(&queue->nvme_sq);
+ nvmet_cq_init(&queue->nvme_cq);
+ ret = nvmet_sq_init(&queue->nvme_sq, &queue->nvme_cq);
if (ret)
goto out_free_connect;
@@ -1993,6 +1954,7 @@ out_destroy_sq:
mutex_unlock(&nvmet_tcp_queue_mutex);
nvmet_sq_destroy(&queue->nvme_sq);
out_free_connect:
+ nvmet_cq_put(&queue->nvme_cq);
nvmet_tcp_free_cmd(&queue->connect);
out_ida_remove:
ida_free(&nvmet_tcp_queue_ida, queue->idx);
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index 5a3c670aec27..5522310bab8d 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -403,6 +403,7 @@ config SCSI_ACARD
config SCSI_AHA152X
tristate "Adaptec AHA152X/2825 support"
depends on ISA && SCSI
+ depends on !HIGHMEM
select SCSI_SPI_ATTRS
select CHECK_SIGNATURE
help
@@ -795,6 +796,7 @@ config SCSI_PPA
tristate "IOMEGA parallel port (ppa - older drives)"
depends on SCSI && PARPORT_PC
depends on HAS_IOPORT
+ depends on !HIGHMEM
help
This driver supports older versions of IOMEGA's parallel port ZIP
drive (a 100 MB removable media device).
@@ -822,6 +824,7 @@ config SCSI_PPA
config SCSI_IMM
tristate "IOMEGA parallel port (imm - newer drives)"
depends on SCSI && PARPORT_PC
+ depends on !HIGHMEM
help
This driver supports newer versions of IOMEGA's parallel port ZIP
drive (a 100 MB removable media device).
diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c
index 4276f868cd91..e94c0a19c435 100644
--- a/drivers/scsi/aha152x.c
+++ b/drivers/scsi/aha152x.c
@@ -746,7 +746,6 @@ struct Scsi_Host *aha152x_probe_one(struct aha152x_setup *setup)
/* need to have host registered before triggering any interrupt */
list_add_tail(&HOSTDATA(shpnt)->host_list, &aha152x_host_list);
- shpnt->no_highmem = true;
shpnt->io_port = setup->io_port;
shpnt->n_io_port = IO_RANGE;
shpnt->irq = setup->irq;
diff --git a/drivers/scsi/imm.c b/drivers/scsi/imm.c
index 1d4c7310f1a6..0821cf994b98 100644
--- a/drivers/scsi/imm.c
+++ b/drivers/scsi/imm.c
@@ -1224,7 +1224,6 @@ static int __imm_attach(struct parport *pb)
host = scsi_host_alloc(&imm_template, sizeof(imm_struct *));
if (!host)
goto out1;
- host->no_highmem = true;
host->io_port = pb->base;
host->n_io_port = ports;
host->dma_channel = -1;
diff --git a/drivers/scsi/ppa.c b/drivers/scsi/ppa.c
index a06329b47851..1ed3171f1797 100644
--- a/drivers/scsi/ppa.c
+++ b/drivers/scsi/ppa.c
@@ -1104,7 +1104,6 @@ static int __ppa_attach(struct parport *pb)
host = scsi_host_alloc(&ppa_template, sizeof(ppa_struct *));
if (!host)
goto out1;
- host->no_highmem = true;
host->io_port = pb->base;
host->n_io_port = ports;
host->dma_channel = -1;
diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c
index 2fa45556e1ea..0ddc95bafc71 100644
--- a/drivers/scsi/scsi_ioctl.c
+++ b/drivers/scsi/scsi_ioctl.c
@@ -601,7 +601,7 @@ static int sg_scsi_ioctl(struct request_queue *q, bool open_for_write,
}
if (bytes) {
- err = blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO);
+ err = blk_rq_map_kern(rq, buffer, bytes, GFP_NOIO);
if (err)
goto error;
}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 1b43013d72c0..144c72f0737a 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -313,8 +313,7 @@ retry:
return PTR_ERR(req);
if (bufflen) {
- ret = blk_rq_map_kern(sdev->request_queue, req,
- buffer, bufflen, GFP_NOIO);
+ ret = blk_rq_map_kern(req, buffer, bufflen, GFP_NOIO);
if (ret)
goto out;
}
@@ -2004,9 +2003,6 @@ void scsi_init_limits(struct Scsi_Host *shost, struct queue_limits *lim)
lim->dma_alignment = max_t(unsigned int,
shost->dma_alignment, dma_get_cache_alignment() - 1);
- if (shost->no_highmem)
- lim->features |= BLK_FEAT_BOUNCE_HIGH;
-
/*
* Propagate the DMA formation properties to the dma-mapping layer as
* a courtesy service to the LLDDs. This needs to check that the buses
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index d36f3b6992bb..152ee3376550 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -1056,13 +1056,20 @@ int usb_stor_probe1(struct us_data **pus,
goto BadDevice;
/*
- * Some USB host controllers can't do DMA; they have to use PIO.
- * For such controllers we need to make sure the block layer sets
- * up bounce buffers in addressable memory.
+ * Some USB host controllers can't do DMA: They have to use PIO, or they
+ * have to use a small dedicated local memory area, or they have other
+ * restrictions on addressable memory.
+ *
+ * We can't support these controllers on highmem systems as we don't
+ * kmap or bounce buffer.
*/
- if (!hcd_uses_dma(bus_to_hcd(us->pusb_dev->bus)) ||
- bus_to_hcd(us->pusb_dev->bus)->localmem_pool)
- host->no_highmem = true;
+ if (IS_ENABLED(CONFIG_HIGHMEM) &&
+ (!hcd_uses_dma(bus_to_hcd(us->pusb_dev->bus)) ||
+ bus_to_hcd(us->pusb_dev->bus)->localmem_pool)) {
+ dev_warn(&intf->dev, "USB Mass Storage not supported on this host controller\n");
+ result = -EINVAL;
+ goto release;
+ }
/* Get the unusual_devs entries and the descriptors */
result = get_device_info(us, id, unusual_dev);
@@ -1081,6 +1088,7 @@ int usb_stor_probe1(struct us_data **pus,
BadDevice:
usb_stor_dbg(us, "storage_probe() failed\n");
+release:
release_everything(us);
return result;
}