From c6ea70604249bc357ce09e9f8e16c29df0fb2fa2 Mon Sep 17 00:00:00 2001 From: "dougmill@linux.vnet.ibm.com" Date: Tue, 16 Aug 2022 15:07:13 +0100 Subject: block: sed-opal: Add ioctl to return device status Provide a mechanism to retrieve basic status information about the device, including the "supported" flag indicating whether SED-OPAL is supported. The information returned is from the various feature descriptors received during the discovery0 step, and so this ioctl does nothing more than perform the discovery0 step and then save the information received. See "struct opal_status" and OPAL_FL_* bits for the status information currently returned. This is necessary to be able to check whether a device is OPAL enabled, set up, locked or unlocked from userspace programs like systemd-cryptsetup and libcryptsetup. Right now we just have to assume the user 'knows' or blindly attempt setup/lock/unlock operations. Signed-off-by: Douglas Miller Tested-by: Luca Boccassi Reviewed-by: Scott Bauer Acked-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20220816140713.84893-1-luca.boccassi@gmail.com Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 1ac0d712a9c3..6f837bb6c715 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -43,6 +43,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_MBR_DONE: case IOC_OPAL_WRITE_SHADOW_MBR: case IOC_OPAL_GENERIC_TABLE_RW: + case IOC_OPAL_GET_STATUS: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 6f5af1a84213..2573772e2fb3 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -132,6 +132,18 @@ struct opal_read_write_table { __u64 priv; }; +#define OPAL_FL_SUPPORTED 0x00000001 +#define OPAL_FL_LOCKING_SUPPORTED 0x00000002 +#define OPAL_FL_LOCKING_ENABLED 0x00000004 +#define OPAL_FL_LOCKED 0x00000008 +#define OPAL_FL_MBR_ENABLED 0x00000010 +#define OPAL_FL_MBR_DONE 0x00000020 + +struct opal_status { + __u32 flags; + __u32 reserved; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -148,5 +160,6 @@ struct opal_read_write_table { #define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done) #define IOC_OPAL_WRITE_SHADOW_MBR _IOW('p', 234, struct opal_shadow_mbr) #define IOC_OPAL_GENERIC_TABLE_RW _IOW('p', 235, struct opal_read_write_table) +#define IOC_OPAL_GET_STATUS _IOR('p', 236, struct opal_status) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From a4e1d0b76e7b32c0839e72679c530445172a2564 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 15 Aug 2022 10:00:43 -0700 Subject: block: Change the return type of blk_mq_map_queues() into void Since blk_mq_map_queues() and the .map_queues() callbacks always return 0, change their return type into void. Most callers ignore the returned value anyway. Cc: Christoph Hellwig Cc: Jason Wang Cc: Keith Busch Cc: Martin K. Petersen Cc: Doug Gilbert Cc: Michael S. Tsirkin Signed-off-by: Bart Van Assche Reviewed-by: John Garry Acked-by: Md Haris Iqbal Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20220815170043.19489-3-bvanassche@acm.org [axboe: fold in fix from Bart] Signed-off-by: Jens Axboe --- include/linux/blk-mq-pci.h | 4 ++-- include/linux/blk-mq-rdma.h | 2 +- include/linux/blk-mq-virtio.h | 2 +- include/linux/blk-mq.h | 4 ++-- include/scsi/scsi_host.h | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h index 0b1f45c62623..ca544e1d3508 100644 --- a/include/linux/blk-mq-pci.h +++ b/include/linux/blk-mq-pci.h @@ -5,7 +5,7 @@ struct blk_mq_queue_map; struct pci_dev; -int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset); +void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, + int offset); #endif /* _LINUX_BLK_MQ_PCI_H */ diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h index 5cc5f0f36218..53b58c610e76 100644 --- a/include/linux/blk-mq-rdma.h +++ b/include/linux/blk-mq-rdma.h @@ -5,7 +5,7 @@ struct blk_mq_tag_set; struct ib_device; -int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, +void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, struct ib_device *dev, int first_vec); #endif /* _LINUX_BLK_MQ_RDMA_H */ diff --git a/include/linux/blk-mq-virtio.h b/include/linux/blk-mq-virtio.h index 687ae287e1dc..13226e9b22dd 100644 --- a/include/linux/blk-mq-virtio.h +++ b/include/linux/blk-mq-virtio.h @@ -5,7 +5,7 @@ struct blk_mq_queue_map; struct virtio_device; -int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, +void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, struct virtio_device *vdev, int first_vec); #endif /* _LINUX_BLK_MQ_VIRTIO_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 92294a5fb083..c38575209d51 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -630,7 +630,7 @@ struct blk_mq_ops { * @map_queues: This allows drivers specify their own queue mapping by * overriding the setup-time function that builds the mq_map. */ - int (*map_queues)(struct blk_mq_tag_set *set); + void (*map_queues)(struct blk_mq_tag_set *set); #ifdef CONFIG_BLK_DEBUG_FS /** @@ -880,7 +880,7 @@ void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -int blk_mq_map_queues(struct blk_mq_queue_map *qmap); +void blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index aa7b7496c93a..7d51af3e7c40 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -276,7 +276,7 @@ struct scsi_host_template { * * Status: OPTIONAL */ - int (* map_queues)(struct Scsi_Host *shost); + void (* map_queues)(struct Scsi_Host *shost); /* * SCSI interface of blk_poll - poll for IO completions. -- cgit v1.2.3 From f5d632d15e9e0a037339601680d82bb840f85d10 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Aug 2022 16:39:04 -0600 Subject: block: shrink rq_map_data a bit We don't need full ints for several of these members. Change the page_order and nr_entries to unsigned shorts, and the true/false from_user and null_mapped to booleans. This shrinks the struct from 32 to 24 bytes on 64-bit archs. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c38575209d51..74b99d716b0b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -963,11 +963,11 @@ blk_status_t blk_insert_cloned_request(struct request *rq); struct rq_map_data { struct page **pages; - int page_order; - int nr_entries; unsigned long offset; - int null_mapped; - int from_user; + unsigned short page_order; + unsigned short nr_entries; + bool null_mapped; + bool from_user; }; int blk_rq_map_user(struct request_queue *, struct request *, -- cgit v1.2.3 From 16ede66973c84f890c03584f79158dd5b2d725f5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 25 Aug 2022 07:53:12 -0700 Subject: sbitmap: fix batched wait_cnt accounting Batched completions can clear multiple bits, but we're only decrementing the wait_cnt by one each time. This can cause waiters to never be woken, stalling IO. Use the batched count instead. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215679 Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20220825145312.1217900-1-kbusch@fb.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 8f5a86e210b9..4d2d5205ab58 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -575,8 +575,9 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. + * @nr: Number of bits cleared. */ -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct -- cgit v1.2.3 From bce1b56c73826fec8caf6187f0c922ede397a5a8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 4 Sep 2022 06:39:25 -0600 Subject: Revert "sbitmap: fix batched wait_cnt accounting" This reverts commit 16ede66973c84f890c03584f79158dd5b2d725f5. This is causing issues with CPU stalls on my test box, revert it for now until we understand what is going on. It looks like infinite looping off sbitmap_queue_wake_up(), but hard to tell with a lot of CPUs hitting this issue and the console scrolling infinitely. Link: https://lore.kernel.org/linux-block/e742813b-ce5c-0d58-205b-1626f639b1bd@kernel.dk/ Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 4d2d5205ab58..8f5a86e210b9 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -575,9 +575,8 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. - * @nr: Number of bits cleared. */ -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct -- cgit v1.2.3 From 4acb83417cadfdcbe64215f9d0ddcf3132af808e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 9 Sep 2022 11:40:22 -0700 Subject: sbitmap: fix batched wait_cnt accounting Batched completions can clear multiple bits, but we're only decrementing the wait_cnt by one each time. This can cause waiters to never be woken, stalling IO. Use the batched count instead. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215679 Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20220909184022.1709476-1-kbusch@fb.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 8f5a86e210b9..4d2d5205ab58 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -575,8 +575,9 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. + * @nr: Number of bits cleared. */ -void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); +void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct -- cgit v1.2.3 From 320fb0f91e55ba248d4bad106b408e59099cfa89 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 Aug 2022 10:22:37 +0800 Subject: blk-throttle: fix that io throttle can only work for single bio Test scripts: cd /sys/fs/cgroup/blkio/ echo "8:0 1024" > blkio.throttle.write_bps_device echo $$ > cgroup.procs dd if=/dev/zero of=/dev/sda bs=10k count=1 oflag=direct & dd if=/dev/zero of=/dev/sda bs=10k count=1 oflag=direct & Test result: 10240 bytes (10 kB, 10 KiB) copied, 10.0134 s, 1.0 kB/s 10240 bytes (10 kB, 10 KiB) copied, 10.0135 s, 1.0 kB/s The problem is that the second bio is finished after 10s instead of 20s. Root cause: 1) second bio will be flagged: __blk_throtl_bio while (true) { ... if (sq->nr_queued[rw]) -> some bio is throttled already break }; bio_set_flag(bio, BIO_THROTTLED); -> flag the bio 2) flagged bio will be dispatched without waiting: throtl_dispatch_tg tg_may_dispatch tg_with_in_bps_limit if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) *wait = 0; -> wait time is zero return true; commit 9f5ede3c01f9 ("block: throttle split bio in case of iops limit") support to count split bios for iops limit, thus it adds flagged bio checking in tg_with_in_bps_limit() so that split bios will only count once for bps limit, however, it introduce a new problem that io throttle won't work if multiple bios are throttled. In order to fix the problem, handle iops/bps limit in different ways: 1) for iops limit, there is no flag to record if the bio is throttled, and iops is always applied. 2) for bps limit, original bio will be flagged with BIO_BPS_THROTTLED, and io throttle will ignore bio with the flag. Noted this patch also remove the code to set flag in __bio_clone(), it's introduced in commit 111be8839817 ("block-throttle: avoid double charge"), and author thinks split bio can be resubmited and throttled again, which is wrong because split bio will continue to dispatch from caller. Fixes: 9f5ede3c01f9 ("block: throttle split bio in case of iops limit") Cc: Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220829022240.3348319-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- include/linux/blk_types.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index ca22b06700a9..2c5806997bbf 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -509,7 +509,7 @@ static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) { bio_clear_flag(bio, BIO_REMAPPED); if (bio->bi_bdev != bdev) - bio_clear_flag(bio, BIO_THROTTLED); + bio_clear_flag(bio, BIO_BPS_THROTTLED); bio->bi_bdev = bdev; bio_associate_blkg(bio); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1ef99790f6ed..41afb4cfb9b0 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -325,7 +325,7 @@ enum { BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ - BIO_THROTTLED, /* This bio has already been subjected to + BIO_BPS_THROTTLED, /* This bio has already been subjected to * throttling rules. Don't do it again. */ BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion * of this bio. */ -- cgit v1.2.3 From 176042404ee6a96ba7e9054e1bda6220360a26ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2022 10:41:56 +0100 Subject: mm: add PSI accounting around ->read_folio and ->readahead calls PSI tries to account for the cost of bringing back in pages discarded by the MM LRU management. Currently the prime place for that is hooked into the bio submission path, which is a rather bad place: - it does not actually account I/O for non-block file systems, of which we have many - it adds overhead and a layering violation to the block layer Add the accounting into the two places in the core MM code that read pages into an address space by calling into ->read_folio and ->readahead so that the entire file system operations are covered, to broaden the coverage and allow removing the accounting in the block layer going forward. As psi_memstall_enter can deal with nested calls this will not lead to double accounting even while the bio annotations are still present. Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220915094200.139713-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0178b2040ea3..201dc7281640 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1173,6 +1173,8 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool _workingset; + unsigned long _pflags; }; #define DEFINE_READAHEAD(ractl, f, r, m, i) \ -- cgit v1.2.3 From 118f3663fbc658e9ad6165e129076981c7b685c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Sep 2022 10:42:00 +0100 Subject: block: remove PSI accounting from the bio layer PSI accounting is now done by the VM code, where it should have been since the beginning. Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220915094200.139713-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 41afb4cfb9b0..e0b098089ef2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -321,7 +321,6 @@ enum { BIO_NO_PAGE_REF, /* don't put release vec pages */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ - BIO_WORKINGSET, /* contains userspace workingset pages */ BIO_QUIET, /* Make BIO Quiet */ BIO_CHAIN, /* chained bio, ->bi_remaining in effect */ BIO_REFFED, /* bio has elevated ->bi_cnt */ -- cgit v1.2.3 From b2bed51a5261f4266ecb857bba680a7f668d3ddf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 20 Sep 2022 13:06:26 -0700 Subject: block: Fix the enum blk_eh_timer_return documentation The documentation of the blk_eh_timer_return enumeration values does not reflect correctly how e.g. the SCSI core uses these values. Fix the documentation. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Damien Le Moal Cc: Johannes Thumshirn Fixes: 88b0cfad2888 ("block: document the blk_eh_timer_return values") Signed-off-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20220920200626.3422296-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 74b99d716b0b..e21ff85751cf 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -268,9 +268,16 @@ static inline void rq_list_move(struct request **src, struct request **dst, rq_list_add(dst, rq); } +/** + * enum blk_eh_timer_return - How the timeout handler should proceed + * @BLK_EH_DONE: The block driver completed the command or will complete it at + * a later time. + * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the + * request to complete. + */ enum blk_eh_timer_return { - BLK_EH_DONE, /* drivers has completed the command */ - BLK_EH_RESET_TIMER, /* reset timer and try again */ + BLK_EH_DONE, + BLK_EH_RESET_TIMER, }; #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ -- cgit v1.2.3 From 77a440e2cbb4b8688b567104f80ce1cda1afbbc4 Mon Sep 17 00:00:00 2001 From: ZiyangZhang Date: Fri, 23 Sep 2022 23:39:14 +0800 Subject: ublk_drv: define macros for recovery feature and check them Define some macros for recovery feature. UBLK_S_DEV_QUIESCED implies that ublk_device is quiesced and is ready for recovery. This state can be observed by userspace. UBLK_F_USER_RECOVERY implies that: (1) ublk_drv enables recovery feature. It won't let monitor_work to automatically abort rqs and release the device. (2) With a dying ubq_daemon, ublk_drv ends(aborts) rqs issued to userspace(ublksrv) before crash. (3) With a dying ubq_daemon, in task work and ublk_queue_rq(), ublk_drv requeues rqs. Signed-off-by: ZiyangZhang Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220923153919.44078-3-ZiyangZhang@linux.alibaba.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 677edaab2b66..340ff14bde49 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -74,9 +74,12 @@ */ #define UBLK_F_NEED_GET_DATA (1UL << 2) +#define UBLK_F_USER_RECOVERY (1UL << 3) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 +#define UBLK_S_DEV_QUIESCED 2 /* shipped via sqe->cmd of io_uring command */ struct ublksrv_ctrl_cmd { -- cgit v1.2.3 From a0d41dc1137470fd4c5c2ef8fdc244d7565e69e6 Mon Sep 17 00:00:00 2001 From: ZiyangZhang Date: Fri, 23 Sep 2022 23:39:17 +0800 Subject: ublk_drv: support UBLK_F_USER_RECOVERY_REISSUE UBLK_F_USER_RECOVERY_REISSUE implies that: With a dying ubq_daemon, ublk_drv let monitor_work requeues rq issued to userspace(ublksrv) before the ubq_daemon is dying. UBLK_F_USER_RECOVERY_REISSUE is designed for backends which: (1) tolerate double-write since ublk_drv may issue the same rq twice. (2) does not let frontend users get I/O error, such as read-only FS and VM backend. Signed-off-by: ZiyangZhang Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220923153919.44078-6-ZiyangZhang@linux.alibaba.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 340ff14bde49..332370628757 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -76,6 +76,8 @@ #define UBLK_F_USER_RECOVERY (1UL << 3) +#define UBLK_F_USER_RECOVERY_REISSUE (1UL << 4) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 -- cgit v1.2.3 From c732a852b419fa057b53657e2daaf9433940391c Mon Sep 17 00:00:00 2001 From: ZiyangZhang Date: Fri, 23 Sep 2022 23:39:18 +0800 Subject: ublk_drv: add START_USER_RECOVERY and END_USER_RECOVERY support START_USER_RECOVERY and END_USER_RECOVERY are two new control commands to support user recovery feature. After a crash, user should send START_USER_RECOVERY, it will: (1) check if (a)current ublk_device is UBLK_S_DEV_QUIESCED which was set by quiesce_work and (b)chardev is released (2) reinit all ubqs, including: (a) put the task_struct and reset ->ubq_daemon to NULL. (b) reset all ublk_io. (3) reset ub->mm to NULL. Then, user should start a new process and send FETCH_REQ on each ubq_daemon. Finally, user should send END_USER_RECOVERY, it will: (1) wait for all new ubq_daemons getting ready. (2) update ublksrv_pid (3) unquiesce the request queue and expect incoming ublk_queue_rq() (4) convert ub's state to UBLK_S_DEV_LIVE Note: we can handle STOP_DEV between START_USER_RECOVERY and END_USER_RECOVERY. This is helpful to users who cannot start new process after sending START_USER_RECOVERY ctrl-cmd. Signed-off-by: ZiyangZhang Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220923153919.44078-7-ZiyangZhang@linux.alibaba.com Signed-off-by: Jens Axboe --- include/uapi/linux/ublk_cmd.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 332370628757..8f88e3a29998 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -17,7 +17,8 @@ #define UBLK_CMD_STOP_DEV 0x07 #define UBLK_CMD_SET_PARAMS 0x08 #define UBLK_CMD_GET_PARAMS 0x09 - +#define UBLK_CMD_START_USER_RECOVERY 0x10 +#define UBLK_CMD_END_USER_RECOVERY 0x11 /* * IO commands, issued by ublk server, and handled by ublk driver. * -- cgit v1.2.3 From de185b56e8a62822d4e1cdb3e068b38ca709aa47 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Sep 2022 20:05:00 +0200 Subject: blk-cgroup: pass a gendisk to blkcg_schedule_throttle Pass the gendisk to blkcg_schedule_throttle as part of moving the blk-cgroup infrastructure to be gendisk based. Remove the unused !BLK_CGROUP stub while we're at it. Signed-off-by: Christoph Hellwig Reviewed-by: Andreas Herrmann Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20220921180501.1539876-17-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 9f40dbc65f82..dd5841a42c33 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -18,14 +18,14 @@ struct bio; struct cgroup_subsys_state; -struct request_queue; +struct gendisk; #define FC_APPID_LEN 129 #ifdef CONFIG_BLK_CGROUP extern struct cgroup_subsys_state * const blkcg_root_css; -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); +void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay); void blkcg_maybe_throttle_current(void); bool blk_cgroup_congested(void); void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css); @@ -39,7 +39,6 @@ struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio); static inline void blkcg_maybe_throttle_current(void) { } static inline bool blk_cgroup_congested(void) { return false; } -static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } static inline struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) { return NULL; -- cgit v1.2.3 From 1c32a8012b7fabe469b6af826edfd4ae2a6201d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Sep 2022 15:38:58 +0200 Subject: nvme: improve the NVME_CONNECT_AUTHREQ* definitions Mark them as unsigned so that we don't need extra casts, and define them relative to cdword0 instead of requiring extra shifts. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke --- include/linux/nvme.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index ae53d74f3696..050d7d0cd81b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1482,8 +1482,8 @@ struct nvmf_connect_command { }; enum { - NVME_CONNECT_AUTHREQ_ASCR = (1 << 2), - NVME_CONNECT_AUTHREQ_ATR = (1 << 1), + NVME_CONNECT_AUTHREQ_ASCR = (1U << 18), + NVME_CONNECT_AUTHREQ_ATR = (1U << 17), }; struct nvmf_connect_data { -- cgit v1.2.3 From 568ec936bf1384fc15873908c96a9aeb62536edb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Sep 2022 09:58:15 +0200 Subject: block: replace blk_queue_nowait with bdev_nowait Replace blk_queue_nowait with a bdev_nowait helpers that takes the block_device given that the I/O submission path should not have to look into the request_queue. Signed-off-by: Christoph Hellwig Reviewed-by: Pankaj Raghav Link: https://lore.kernel.org/r/20220927075815.269694-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 84b13fdd34a7..4750772ef228 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -618,7 +618,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) -#define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); @@ -1280,6 +1279,11 @@ static inline bool bdev_fua(struct block_device *bdev) return test_bit(QUEUE_FLAG_FUA, &bdev_get_queue(bdev)->queue_flags); } +static inline bool bdev_nowait(struct block_device *bdev) +{ + return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags); +} + static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3 From 8cafdb5ab94cda3ebb0975be16e2d564a05132ea Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 29 Sep 2022 09:47:44 +0200 Subject: block: adapt blk_mq_plug() to not plug for writes that require a zone lock The current implementation of blk_mq_plug() disables plugging for all operations that involves a transfer to the device as we just check if the last bit in op_is_write() function. Modify blk_mq_plug() to disable plugging only for REQ_OP_WRITE and REQ_OP_WRITE_ZEROS as they might require a zone lock. Suggested-by: Christoph Hellwig Suggested-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Pankaj Raghav Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220929074745.103073-2-p.raghav@samsung.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4750772ef228..49373d002631 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1304,6 +1304,15 @@ static inline bool bdev_is_zoned(struct block_device *bdev) return false; } +static inline bool bdev_op_is_zoned_write(struct block_device *bdev, + blk_opf_t op) +{ + if (!bdev_is_zoned(bdev)) + return false; + + return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES; +} + static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); -- cgit v1.2.3