From a55b70f1273a54b33482db8b2568da435fefd6c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Oct 2022 08:59:16 -0700 Subject: block: remove bio_start_io_acct_time bio_start_io_acct_time is not actually used anywhere, so remove it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221025155916.270303-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 50e358a19d98..57ed49f20d2e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1458,7 +1458,6 @@ unsigned long bdev_start_io_acct(struct block_device *bdev, void bdev_end_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time); -void bio_start_io_acct_time(struct bio *bio, unsigned long start_time); unsigned long bio_start_io_acct(struct bio *bio); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev); -- cgit v1.2.3 From b179c98f76978a0fae072c2b2dad8e143218afd8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 25 Oct 2022 12:17:53 -0700 Subject: block: Remove request.write_hint Commit c75e707fe1aa ("block: remove the per-bio/request write hint") removed all code that uses the struct request write_hint member. Hence also remove 'write_hint' itself. Reviewed-by: Ming Lei Cc: Christoph Hellwig Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20221025191755.1711437-2-bvanassche@acm.org Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ba18e9bdb799..569053ed959d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -140,7 +140,6 @@ struct request { struct blk_crypto_keyslot *crypt_keyslot; #endif - unsigned short write_hint; unsigned short ioprio; enum mq_rq_state state; -- cgit v1.2.3 From adff215830fcf3ef74f2f0d4dd5a47a6927d450b Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Sun, 30 Oct 2022 13:20:08 +0800 Subject: block: simplify blksize_bits() implementation Convert current looping-based implementation into bit operation, which can bring improvement for: 1) bitops is more efficient for its arch-level optimization. 2) Given that blksize_bits() is inline, _if_ @size is compile-time constant, it's possible that order_base_2() _may_ make output compile-time evaluated, depending on code context and compiler behavior. 
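For illustration, the equivalence of the two blksize_bits() forms can be checked with a small standalone program (not from the patch itself): order_base_2() is emulated here with __builtin_clz() since the kernel helper is not available in userspace, SECTOR_SHIFT is assumed to be 9, and only power-of-two sizes larger than 256 bytes are exercised, as the function's comment requires.

/*
 * Standalone sketch (not part of the patch): verify that the bit-op form
 * of blksize_bits() matches the old loop for power-of-two sizes > 256.
 * order_base_2() is emulated via __builtin_clz(); SECTOR_SHIFT assumed 9.
 */
#include <assert.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

static unsigned int order_base_2(unsigned int n)
{
	return n <= 1 ? 0 : 32 - __builtin_clz(n - 1);
}

static unsigned int blksize_bits_loop(unsigned int size)
{
	unsigned int bits = 8;

	do {
		bits++;
		size >>= 1;
	} while (size > 256);
	return bits;
}

static unsigned int blksize_bits_bitop(unsigned int size)
{
	return order_base_2(size >> SECTOR_SHIFT) + SECTOR_SHIFT;
}

int main(void)
{
	for (unsigned int size = 512; size <= 65536; size <<= 1) {
		printf("%6u -> %u\n", size, blksize_bits_bitop(size));
		assert(blksize_bits_loop(size) == blksize_bits_bitop(size));
	}
	return 0;
}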
Signed-off-by: Dawei Li Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/TYCP286MB23238842958D7C083D6B67CECA349@TYCP286MB2323.JPNP286.PROD.OUTLOOK.COM Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 57ed49f20d2e..32137d85c9ad 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1349,12 +1349,7 @@ static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, /* assumes size > 256 */ static inline unsigned int blksize_bits(unsigned int size) { - unsigned int bits = 8; - do { - bits++; - size >>= 1; - } while (size > 256); - return bits; + return order_base_2(size >> SECTOR_SHIFT) + SECTOR_SHIFT; } static inline unsigned int block_size(struct block_device *bdev) -- cgit v1.2.3 From 80bd4a7aab4c9ce59bf5e35fdf52aa23d8a3c9f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:47 +0100 Subject: blk-mq: move the srcu_struct used for quiescing to the tagset All I/O submissions have fairly similar latencies, and a tagset-wide quiesce is a fairly common operation. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Ming Lei Reviewed-by: Chao Leng Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-12-hch@lst.de [axboe: fix whitespace] Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ++++ include/linux/blkdev.h | 9 --------- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 569053ed959d..f059edebb11d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -7,6 +7,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -500,6 +501,8 @@ enum hctx_type { * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. + * @srcu: Use as lock when type of the request queue is blocking + * (BLK_MQ_F_BLOCKING). */ struct blk_mq_tag_set { struct blk_mq_queue_map map[HCTX_MAX_TYPES]; @@ -520,6 +523,7 @@ struct blk_mq_tag_set { struct mutex tag_list_lock; struct list_head tag_list; + struct srcu_struct *srcu; }; /** diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 32137d85c9ad..6a6fa167fc82 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -543,18 +542,11 @@ struct request_queue { struct mutex debugfs_mutex; bool mq_sysfs_init_done; - - /** - * @srcu: Sleepable RCU. Use as lock when type of the request queue - * is blocking (BLK_MQ_F_BLOCKING). 
Must be the last member - */ - struct srcu_struct srcu[]; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ -#define QUEUE_FLAG_HAS_SRCU 2 /* SRCU is allocated */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ @@ -590,7 +582,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) -#define blk_queue_has_srcu(q) test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ -- cgit v1.2.3 From 483239c75ba768e0e2c0e0c503e5fc13c3d5773a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:48 +0100 Subject: blk-mq: pass a tagset to blk_mq_wait_quiesce_done Nothing in blk_mq_wait_quiesce_done needs the request_queue now, so just pass the tagset, and move the non-mq check into the only caller that needs it. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chao Leng Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-13-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f059edebb11d..061ea6e7af01 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -880,7 +880,7 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); -void blk_mq_wait_quiesce_done(struct request_queue *q); +void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); -- cgit v1.2.3 From 414dd48e882c5a39e7bd01b096ee6497eb3314b0 Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Tue, 1 Nov 2022 16:00:49 +0100 Subject: blk-mq: add tagset quiesce interface Drivers that have shared tagsets may need to quiesce potentially a lot of request queues that all share a single tagset (e.g. nvme). Add an interface to quiesce all the queues on a given tagset. This interface is useful because it can speed up the quiesce by doing it in parallel. Because some queues should not need to be quiesced (e.g. the nvme connect_q) when quiescing the tagset, introduce a QUEUE_FLAG_SKIP_TAGSET_QUIESCE flag to allow this new interface to skip quiescing a particular queue. 
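As an illustration (not part of the patch), a driver built on a shared tagset might use the new interface roughly as follows; the mydrv_* names and structure are invented, while blk_mq_quiesce_tagset(), blk_mq_unquiesce_tagset() and QUEUE_FLAG_SKIP_TAGSET_QUIESCE are the pieces added by this series:

/*
 * Hypothetical driver-side sketch. Only the tagset quiesce calls and the
 * QUEUE_FLAG_SKIP_TAGSET_QUIESCE flag come from this series.
 */
#include <linux/blk-mq.h>
#include <linux/blkdev.h>

struct mydrv_ctrl {
	struct blk_mq_tag_set tag_set;
	struct request_queue *admin_q;	/* must keep working during resets */
};

static void mydrv_init_admin_queue(struct mydrv_ctrl *ctrl)
{
	/* Opt this queue out of tagset-wide quiesce, like nvme's connect_q. */
	blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, ctrl->admin_q);
}

static void mydrv_reset_prepare(struct mydrv_ctrl *ctrl)
{
	/* One call quiesces every queue sharing the tagset, in parallel. */
	blk_mq_quiesce_tagset(&ctrl->tag_set);
	/* ... tear down / re-establish the hardware association ... */
}

static void mydrv_reset_done(struct mydrv_ctrl *ctrl)
{
	blk_mq_unquiesce_tagset(&ctrl->tag_set);
}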
Signed-off-by: Chao Leng [hch: simplify for the per-tag_set srcu_struct] Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Ming Lei Reviewed-by: Chao Leng Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-14-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 ++ include/linux/blkdev.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 061ea6e7af01..109a0e30c470 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -881,6 +881,8 @@ void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set); +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a6fa167fc82..9188aa3f6259 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,6 +571,7 @@ struct request_queue { #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ +#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/ #define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \ (1UL << QUEUE_FLAG_SAME_COMP) | \ @@ -610,6 +611,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) +#define blk_queue_skip_tagset_quiesce(q) \ + test_bit(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -- cgit v1.2.3 From 0f0892356fa174bdd8bd655c820ee3658c4c9f01 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 21 Oct 2022 11:41:08 -0600 Subject: mm: allow multiple error returns in try_grab_page() In order to add checks for P2PDMA memory into try_grab_page(), expand the error return from a bool to an int/error code. Update all the callsites to handle the change in usage. Also remove the WARN_ON_ONCE() call at the callsites, since there already is a WARN_ON_ONCE() inside the function if it fails. 
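For illustration only, a made-up call site sketch of the new pattern; the real callers live in mm/gup.c and their error handling is more involved:

/*
 * Illustrative call-site sketch (not from the patch): the errno from
 * try_grab_page() is simply propagated; the WARN now lives inside it.
 */
#include <linux/err.h>
#include <linux/mm.h>

static struct page *grab_followed_page(struct page *page, unsigned int gup_flags)
{
	int ret;

	/* Previously a bool, typically wrapped in WARN_ON_ONCE() by callers. */
	ret = try_grab_page(page, gup_flags);
	if (ret)
		return ERR_PTR(ret);
	return page;
}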
Signed-off-by: Logan Gunthorpe Reviewed-by: Dan Williams Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221021174116.7200-2-logang@deltatee.com Signed-off-by: Jens Axboe --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bbcccbc5565..62a91dc1272b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1129,7 +1129,7 @@ static inline void get_page(struct page *page) folio_get(page_folio(page)); } -bool __must_check try_grab_page(struct page *page, unsigned int flags); +int __must_check try_grab_page(struct page *page, unsigned int flags); static inline __must_check bool try_get_page(struct page *page) { -- cgit v1.2.3 From 4003f107fa2eabb0aab90e37a1ed7b74c6f0d132 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 21 Oct 2022 11:41:09 -0600 Subject: mm: introduce FOLL_PCI_P2PDMA to gate getting PCI P2PDMA pages GUP Callers that expect PCI P2PDMA pages can now set FOLL_PCI_P2PDMA to allow obtaining P2PDMA pages. If GUP is called without the flag and a P2PDMA page is found, it will return an error in try_grab_page() or try_grab_folio(). The check is safe to do before taking the reference to the page in both cases seeing the page should be protected by either the appropriate ptl or mmap_lock; or the gup fast guarantees preventing TLB flushes. try_grab_folio() has one call site that WARNs on failure and cannot actually deal with the failure of this function (it seems it will get into an infinite loop). Expand the comment there to document a couple more conditions on why it will not fail. FOLL_PCI_P2PDMA cannot be set if FOLL_LONGTERM is set. This is to copy fsdax until pgmap refcounts are fixed (see the link below for more information). Link: https://lkml.kernel.org/r/Yy4Ot5MoOhsgYLTQ@ziepe.ca Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221021174116.7200-3-logang@deltatee.com Signed-off-by: Jens Axboe --- include/linux/mm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 62a91dc1272b..6b081a8dcf88 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2958,6 +2958,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ #define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ #define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ +#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */ /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each -- cgit v1.2.3 From d82076403cef7fcd1e7617c9db48bf21ebdc1f9c Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 21 Oct 2022 11:41:10 -0600 Subject: iov_iter: introduce iov_iter_get_pages_[alloc_]flags() Add iov_iter_get_pages_flags() and iov_iter_get_pages_alloc_flags() which take a flags argument that is passed to get_user_pages_fast(). This is so that FOLL_PCI_P2PDMA can be passed when appropriate. 
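A hypothetical caller might use the new flags-taking variant like this; the wrapper is invented, while the iov_iter_get_pages_alloc() signature and FOLL_PCI_P2PDMA are the ones introduced in this series (the flag itself comes from the previous patch):

/*
 * Hypothetical snippet: pin the pages behind an iov_iter while opting in
 * to PCI P2PDMA pages. Returns bytes pinned and fills *pages/*page_offset.
 */
#include <linux/mm.h>
#include <linux/uio.h>

static ssize_t pin_user_buffer(struct iov_iter *iter, struct page ***pages,
			       size_t max_bytes, size_t *page_offset)
{
	/* This caller knows how to map P2PDMA pages, so opt in explicitly. */
	return iov_iter_get_pages_alloc(iter, pages, max_bytes, page_offset,
					FOLL_PCI_P2PDMA);
}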
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221021174116.7200-4-logang@deltatee.com Signed-off-by: Jens Axboe --- include/linux/uio.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/uio.h b/include/linux/uio.h index 2e3134b14ffd..9ede533ce64c 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -247,8 +247,14 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); +ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, + size_t maxsize, unsigned maxpages, size_t *start, + unsigned gup_flags); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); +ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, size_t *start, + unsigned gup_flags); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); -- cgit v1.2.3 From 49580e690755d0e51ed7aa2c33225dd884fa738a Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 21 Oct 2022 11:41:11 -0600 Subject: block: add check when merging zone device pages Consecutive zone device pages should not be merged into the same sgl or bvec segment with other types of pages or if they belong to different pgmaps. Otherwise getting the pgmap of a given segment is not possible without scanning the entire segment. This helper returns true either if both pages are not zone device pages or both pages are zone device pages with the same pgmap. Add a helper to determine if zone device pages are mergeable and use this helper in page_is_mergeable(). Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221021174116.7200-5-logang@deltatee.com Signed-off-by: Jens Axboe --- include/linux/mmzone.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5f74891556f3..9c49ec5d0e25 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -986,6 +986,25 @@ static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } + +/* + * Consecutive zone device pages should not be merged into the same sgl + * or bvec segment with other types of pages or if they belong to different + * pgmaps. Otherwise getting the pgmap of a given segment is not possible + * without scanning the entire segment. This helper returns true either if + * both pages are not zone device pages or both pages are zone device pages + * with the same pgmap. 
+ */ +static inline bool zone_device_pages_have_same_pgmap(const struct page *a, + const struct page *b) +{ + if (is_zone_device_page(a) != is_zone_device_page(b)) + return false; + if (!is_zone_device_page(a)) + return true; + return a->pgmap == b->pgmap; +} + extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else @@ -993,6 +1012,11 @@ static inline bool is_zone_device_page(const struct page *page) { return false; } +static inline bool zone_device_pages_have_same_pgmap(const struct page *a, + const struct page *b) +{ + return true; +} #endif static inline bool folio_is_zone_device(const struct folio *folio) -- cgit v1.2.3 From 4f8126bb2308066b877859e4b5923ffb54143630 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Sat, 5 Nov 2022 19:10:55 -0400 Subject: sbitmap: Use single per-bitmap counting to wake up queued tags sbitmap suffers from code complexity, as demonstrated by recent fixes, and eventual lost wake ups on nested I/O completion. The later happens, from what I understand, due to the non-atomic nature of the updates to wait_cnt, which needs to be subtracted and eventually reset when equal to zero. This two step process can eventually miss an update when a nested completion happens to interrupt the CPU in between the wait_cnt updates. This is very hard to fix, as shown by the recent changes to this code. The code complexity arises mostly from the corner cases to avoid missed wakes in this scenario. In addition, the handling of wake_batch recalculation plus the synchronization with sbq_queue_wake_up is non-trivial. This patchset implements the idea originally proposed by Jan [1], which removes the need for the two-step updates of wait_cnt. This is done by tracking the number of completions and wakeups in always increasing, per-bitmap counters. Instead of having to reset the wait_cnt when it reaches zero, we simply keep counting, and attempt to wake up N threads in a single wait queue whenever there is enough space for a batch. Waking up less than batch_wake shouldn't be a problem, because we haven't changed the conditions for wake up, and the existing batch calculation guarantees at least enough remaining completions to wake up a batch for each queue at any time. Performance-wise, one should expect very similar performance to the original algorithm for the case where there is no queueing. In both the old algorithm and this implementation, the first thing is to check ws_active, which bails out if there is no queueing to be managed. In the new code, we took care to avoid accounting completions and wakeups when there is no queueing, to not pay the cost of atomic operations unnecessarily, since it doesn't skew the numbers. For more interesting cases, where there is queueing, we need to take into account the cross-communication of the atomic operations. I've been benchmarking by running parallel fio jobs against a single hctx nullb in different hardware queue depth scenarios, and verifying both IOPS and queueing. Each experiment was repeated 5 times on a 20-CPU box, with 20 parallel jobs. fio was issuing fixed-size randwrites with qd=64 against nullb, varying only the hardware queue length per test. 
queue size     2                4                8                16                32                64
6.1-rc2        1681.1K (1.6K)   2633.0K (12.7K)  6940.8K (16.3K)  8172.3K (617.5K)  8391.7K (367.1K)  8606.1K (351.2K)
patched        1721.8K (15.1K)  3016.7K (3.8K)   7543.0K (89.4K)  8132.5K (303.4K)  8324.2K (230.6K)  8401.8K (284.7K)

The following is a similar experiment, run against a nullb with a single bitmap shared by 20 hctx spread across 2 NUMA nodes. This has 40 parallel fio jobs operating on the same device.

queue size     2                4                8                16                32                64
6.1-rc2        1081.0K (2.3K)   957.2K (1.5K)    1699.1K (5.7K)   6178.2K (124.6K)  12227.9K (37.7K)  13286.6K (92.9K)
patched        1081.8K (2.8K)   1316.5K (5.4K)   2364.4K (1.8K)   6151.4K (20.0K)   11893.6K (17.5K)  12385.6K (18.4K)

It has also survived blktests and a 12h-stress run against nullb. I also ran the code against nvme and a scsi SSD, and I didn't observe performance regression in those. If there are other tests you think I should run, please let me know and I will follow up with results. [1] https://lore.kernel.org/all/aef9de29-e9f5-259a-f8be-12d1b734e72@google.com/ Cc: Hugh Dickins Cc: Keith Busch Cc: Liu Song Suggested-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20221105231055.25953-1-krisman@suse.de Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 4d2d5205ab58..d662cf136021 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -86,11 +86,6 @@ struct sbitmap { * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue. */ struct sbq_wait_state { - /** - * @wait_cnt: Number of frees remaining before we wake up. - */ - atomic_t wait_cnt; - /** * @wait: Wait queue. */ @@ -138,6 +133,17 @@ struct sbitmap_queue { * sbitmap_queue_get_shallow() */ unsigned int min_shallow_depth; + + /** + * @completion_cnt: Number of bits cleared passed to the + * wakeup function. + */ + atomic_t completion_cnt; + + /** + * @wakeup_cnt: Number of thread wake ups issued. + */ + atomic_t wakeup_cnt; }; /** -- cgit v1.2.3 From 42271ca389edb0446b9e492858b4c38083b0b9f8 Mon Sep 17 00:00:00 2001 From: Giulio Benetti Date: Wed, 19 Oct 2022 18:04:07 +0200 Subject: lib/raid6: drop RAID6_USE_EMPTY_ZERO_PAGE RAID6_USE_EMPTY_ZERO_PAGE is unused and hardcoded to 0, so let's drop it. Signed-off-by: Giulio Benetti Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- include/linux/raid/pq.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index d6e5a1feb947..f29aaaf2eb21 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -10,17 +10,9 @@ #ifdef __KERNEL__ -/* Set to 1 to use kernel-wide empty_zero_page */ -#define RAID6_USE_EMPTY_ZERO_PAGE 0 #include -/* We need a pre-zeroed page... if we don't want to use the kernel-provided - one define it here */ -#if RAID6_USE_EMPTY_ZERO_PAGE -# define raid6_empty_zero_page empty_zero_page -#else extern const char raid6_empty_zero_page[PAGE_SIZE]; -#endif #else /* ! __KERNEL__ */ /* Used for testing in user space */ -- cgit v1.2.3 From 855b7717f44b13e0990aa5ad36bbf9aa35051516 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Mon, 31 Oct 2022 21:53:50 +0530 Subject: nvme: fine-granular CAP_SYS_ADMIN for nvme io commands Currently both io and admin commands are kept under a coarse-granular CAP_SYS_ADMIN check, disregarding file mode completely. 
$ ls -l /dev/ng*
crw-rw-rw- 1 root root 242, 0 Sep 9 19:20 /dev/ng0n1
crw------- 1 root root 242, 1 Sep 9 19:20 /dev/ng0n2

In the example above, ng0n1 appears as if it may allow unprivileged read/write operation but it does not and behaves the same as ng0n2. This patch implements a shift from CAP_SYS_ADMIN to more fine-granular control for io-commands. If CAP_SYS_ADMIN is present, nothing else is checked as before. Otherwise, the following rules are in place:
- any admin-cmd is not allowed
- vendor-specific and fabric commands are not allowed
- io-commands that can write are allowed if matching FMODE_WRITE permission is present
- io-commands that read are allowed

Add a helper nvme_cmd_allowed that implements the above policy. Change all the callers of CAP_SYS_ADMIN to go through nvme_cmd_allowed for any decision making. Since the file open mode is taken into account for any approval/denial, change various places to keep the file-mode information handy. Signed-off-by: Kanchan Joshi Reviewed-by: Jens Axboe Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 050d7d0cd81b..1d102b662e88 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -797,6 +797,7 @@ enum nvme_opcode { nvme_cmd_zone_mgmt_send = 0x79, nvme_cmd_zone_mgmt_recv = 0x7a, nvme_cmd_zone_append = 0x7d, + nvme_cmd_vendor_start = 0x80, }; #define nvme_opcode_name(opcode) { opcode, #opcode } -- cgit v1.2.3 From 1b96f862ecccb3e6f950eba584bebf22955cecc5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 30 Oct 2022 16:50:15 +0100 Subject: nvme: implement the DEAC bit for the Write Zeroes command While the specification allows devices to either deallocate data or to actually write zeroes on any Write Zeroes command, many SSDs only do the sensible thing and deallocate data when the DEAC bit is specified. Set it when it is supported and the caller doesn't explicitly opt out of deallocation. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1d102b662e88..d6be2a686100 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -964,6 +964,7 @@ enum { NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, NVME_RW_PRINFO_PRACT = 1 << 13, NVME_RW_DTYPE_STREAMS = 1 << 4, + NVME_WZ_DEAC = 1 << 9, }; struct nvme_dsm_cmd { -- cgit v1.2.3 From 6e4068a11413b96687a03c39814539e202de294b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Nov 2022 15:18:19 +0000 Subject: mempool: introduce mempool_is_saturated Introduce a helper mempool_is_saturated(), which tells if the mempool is under-filled or not. We need it to figure out whether it should be freed right into the mempool or could be cached with top level caches. 
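As a usage illustration (not taken from the patch), a free path might look roughly like this; the per-cpu cache helpers are invented stand-ins for something like the bio allocation cache that motivated the helper:

/*
 * Illustrative only: prefer a cheaper cache once the mempool reserve is
 * already full. struct my_cache and my_cache_put() are made up.
 */
#include <linux/mempool.h>

struct my_cache;
bool my_cache_put(struct my_cache *cache, void *elem);

static void my_free_element(mempool_t *pool, struct my_cache *cache, void *elem)
{
	/* Reserve already holds min_nr elements: keep this one in the cache. */
	if (mempool_is_saturated(pool) && my_cache_put(cache, elem))
		return;

	/* Otherwise hand it back so the reserve refills towards min_nr. */
	mempool_free(elem, pool);
}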
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/636aed30be8c35d78f45e244998bc6209283cccc.1667384020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/mempool.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 0c964ac107c2..4aae6c06c5f2 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -30,6 +30,11 @@ static inline bool mempool_initialized(mempool_t *pool) return pool->elements != NULL; } +static inline bool mempool_is_saturated(mempool_t *pool) +{ + return READ_ONCE(pool->curr_nr) >= pool->min_nr; +} + void mempool_exit(mempool_t *pool); int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, -- cgit v1.2.3 From ee7dc86b6d3e3b86c2c487f713eda657850de238 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 15 Nov 2022 17:45:52 -0500 Subject: wait: Return number of exclusive waiters awaken Sbitmap code will need to know how many waiters were actually woken for its batched wakeups implementation. Return the number of woken exclusive waiters from __wake_up() to facilitate that. Suggested-by: Jan Kara Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221115224553.23594-3-krisman@suse.de Signed-off-by: Jens Axboe --- include/linux/wait.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 7f5a51aae0a7..a0307b516b09 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -209,7 +209,7 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq list_del(&wq_entry->entry); } -void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); +int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, unsigned int mode, void *key, wait_queue_entry_t *bookmark); -- cgit v1.2.3 From 7abc077788363ac7194aefd355306f8e974feff7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2022 22:10:51 +0800 Subject: block: remove delayed holder registration Now that dm has been fixed to keep track of holder registrations before add_disk, the somewhat buggy block layer code can be safely removed. 
Signed-off-by: Christoph Hellwig Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20221115141054.1051801-8-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9188aa3f6259..516e45246868 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -833,7 +833,6 @@ void set_capacity(struct gendisk *disk, sector_t size); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); -int bd_register_pending_holders(struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) @@ -844,10 +843,6 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { } -static inline int bd_register_pending_holders(struct gendisk *disk) -{ - return 0; -} #endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ dev_t part_devt(struct gendisk *disk, u8 partno); -- cgit v1.2.3 From dae590a6c96c799434e0ff8156ef29b88c257e60 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 4 Nov 2022 20:59:02 -0400 Subject: blk-cgroup: Flush stats at blkgs destruction path As noted by Michal, the blkg_iostat_set's in the lockless list hold reference to blkg's to protect against their removal. Those blkg's hold reference to blkcg. When a cgroup is being destroyed, cgroup_rstat_flush() is only called at css_release_work_fn() which is called when the blkcg reference count reaches 0. This circular dependency will prevent blkcg from being freed until some other events cause cgroup_rstat_flush() to be called to flush out the pending blkcg stats. To prevent this delayed blkcg removal, add a new cgroup_rstat_css_cpu_flush() function to flush stats for a given css and cpu and call it at the blkgs destruction path, blkcg_destroy_blkgs(), whenever there are still some pending stats to be flushed. This will ensure that blkcg reference count can reach 0 ASAP. Signed-off-by: Waiman Long Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221105005902.407297-4-longman@redhat.com Signed-off-by: Jens Axboe --- include/linux/cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 528bd44b59e2..6c4e66b3fa84 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -766,6 +766,7 @@ void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu); /* * Basic resource stats. -- cgit v1.2.3 From fce3caea0f241f5d34855c82c399d5e0e2d91f07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:42 +0100 Subject: blk-crypto: don't use struct request_queue for public interfaces Switch all public blk-crypto interfaces to use struct block_device arguments to specify the device they operate on instead of the request_queue, which is a block layer implementation detail. 
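As a rough sketch of what a consumer (an fscrypt-like filesystem) looks like after the change, the caller now passes the block_device directly instead of digging out the request_queue; the my_fs_* wrappers below are invented, only the blk_crypto_*() calls reflect the new interface:

/*
 * Sketch of a blk-crypto consumer after the interface change.
 */
#include <linux/blk-crypto.h>
#include <linux/blkdev.h>

static int my_fs_enable_inline_crypt(struct block_device *bdev,
				     const struct blk_crypto_key *key)
{
	/* Previously: blk_crypto_start_using_key(key, bdev_get_queue(bdev)) */
	return blk_crypto_start_using_key(bdev, key);
}

static int my_fs_forget_inline_crypt_key(struct block_device *bdev,
					 const struct blk_crypto_key *key)
{
	/* Previously took the request_queue as the first argument. */
	return blk_crypto_evict_key(bdev, key);
}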
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-crypto.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 69b24fe92cbf..561ca92e204d 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -71,9 +71,6 @@ struct bio_crypt_ctx { #include #include -struct request; -struct request_queue; - #ifdef CONFIG_BLK_INLINE_ENCRYPTION static inline bool bio_has_crypt_ctx(struct bio *bio) @@ -94,13 +91,13 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, unsigned int dun_bytes, unsigned int data_unit_size); -int blk_crypto_start_using_key(const struct blk_crypto_key *key, - struct request_queue *q); +int blk_crypto_start_using_key(struct block_device *bdev, + const struct blk_crypto_key *key); -int blk_crypto_evict_key(struct request_queue *q, +int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); -bool blk_crypto_config_supported(struct request_queue *q, +bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -- cgit v1.2.3 From 6715c98b6cf003f26b1b2f655393134e9d999a05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:43 +0100 Subject: blk-crypto: add a blk_crypto_config_supported_natively helper Add a blk_crypto_config_supported_natively helper that wraps __blk_crypto_cfg_supported to retrieve the crypto_profile from the request queue. With this, fscrypt can stop including blk-crypto-profile.h and rely on the public consumer interface in blk-crypto.h. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-crypto.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 561ca92e204d..a33d32f5c268 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -97,6 +97,8 @@ int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key); int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); +bool blk_crypto_config_supported_natively(struct block_device *bdev, + const struct blk_crypto_config *cfg); bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); -- cgit v1.2.3 From 3569788c08235c6f3e9e6ca724b2df44787ff487 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:44 +0100 Subject: blk-crypto: move internal only declarations to blk-crypto-internal.h blk_crypto_get_keyslot, blk_crypto_put_keyslot, __blk_crypto_evict_key and __blk_crypto_cfg_supported are only used internally by the blk-crypto code, so move them out of blk-crypto-profile.h, which is included by drivers that supply blk-crypto functionality. 
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-crypto-profile.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index bbab65bd5428..e6802b69cdd6 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -138,18 +138,6 @@ int devm_blk_crypto_profile_init(struct device *dev, unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot); -blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, - const struct blk_crypto_key *key, - struct blk_crypto_keyslot **slot_ptr); - -void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); - -bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, - const struct blk_crypto_config *cfg); - -int __blk_crypto_evict_key(struct blk_crypto_profile *profile, - const struct blk_crypto_key *key); - void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile); void blk_crypto_profile_destroy(struct blk_crypto_profile *profile); -- cgit v1.2.3 From 2cd10a496a86787367716b684dadfecbb594095b Mon Sep 17 00:00:00 2001 From: Joel Colledge Date: Tue, 22 Nov 2022 14:43:00 +0100 Subject: lru_cache: remove unused lc_private, lc_set, lc_index_of MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Joel Colledge Signed-off-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20221122134301.69258-4-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- include/linux/lru_cache.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 07add7882a5d..c9afcdd9324c 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -199,7 +199,6 @@ struct lru_cache { unsigned long flags; - void *lc_private; const char *name; /* nr_elements there */ @@ -241,7 +240,6 @@ extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, unsigned e_count, size_t e_size, size_t e_off); extern void lc_reset(struct lru_cache *lc); extern void lc_destroy(struct lru_cache *lc); -extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); extern void lc_del(struct lru_cache *lc, struct lc_element *element); extern struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr); @@ -297,6 +295,5 @@ extern bool lc_is_used(struct lru_cache *lc, unsigned int enr); container_of(ptr, type, member) extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i); -extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e); #endif -- cgit v1.2.3 From c62256dda37133a48d56cecc15e4a4d527d4cc46 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Nov 2022 08:25:46 -0700 Subject: Revert "blk-cgroup: Flush stats at blkgs destruction path" This reverts commit dae590a6c96c799434e0ff8156ef29b88c257e60. We've had a few reports on this causing a crash at boot time, because of a reference issue. While this problem seemingly did exist before the patch and needs solving separately, this patch makes it a lot easier to trigger. 
Link: https://lore.kernel.org/linux-block/CA+QYu4oxiRKC6hJ7F27whXy-PRBx=Tvb+-7TQTONN8qTtV3aDA@mail.gmail.com/ Link: https://lore.kernel.org/linux-block/69af7ccb-6901-c84c-0e95-5682ccfb750c@acm.org/ Signed-off-by: Jens Axboe --- include/linux/cgroup.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6c4e66b3fa84..528bd44b59e2 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -766,7 +766,6 @@ void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); -void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu); /* * Basic resource stats. -- cgit v1.2.3 From 2bd85221a625b316114bafaab527770b607095d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:26:36 +0100 Subject: block: untangle request_queue refcounting from sysfs The kobject embedded into the request_queue is used for the queue directory in sysfs, but that is a child of the gendisks directory and is intimately tied to it. Move this kobject to the gendisk and use a refcount_t in the request_queue for the actual request_queue refcounting that is completely unrelated to the device model. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221114042637.1009333-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 516e45246868..469299ea0660 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -155,6 +155,7 @@ struct gendisk { unsigned open_partitions; /* number of open partitions */ struct backing_dev_info *bdi; + struct kobject queue_kobj; /* the queue/ directory */ struct kobject *slave_dir; #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED struct list_head slave_bdevs; @@ -430,10 +431,7 @@ struct request_queue { struct gendisk *disk; - /* - * queue kobject - */ - struct kobject kobj; + refcount_t refs; /* * mq queue kobject -- cgit v1.2.3 From 63c9eac4b6d75859703f5820414986edefb01210 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Oct 2022 20:19:30 +0800 Subject: blk-iocost: Trace vtime_base_rate instead of vtime_rate Since commit ac33e91e2daca ("blk-iocost: implement vtime loss compensation") renamed the original vtime_rate to vtime_base_rate, the current vtime_rate is the original vtime_rate with compensation. The rate currently shown in the tracepoints is a mix of vtime_rate and vtime_base_rate: 1) In function ioc_adjust_base_vrate, the first trace_iocost_ioc_vrate_adj shows vtime_rate, the second trace_iocost_ioc_vrate_adj shows vtime_base_rate. 2) Function iocg_activate shows vtime_rate by calling TRACE_IOCG_PATH(iocg_activate... 3) Function ioc_check_iocgs shows vtime_rate by calling TRACE_IOCG_PATH(iocg_idle... Trace vtime_base_rate instead of vtime_rate because: 1) Before commit ac33e91e2daca ("blk-iocost: implement vtime loss compensation"), the traced rate was without compensation, so keep showing the rate without compensation. 2) vtime_base_rate is more stable, while vtime_rate heavily depends on the excess budget in the current period, which may change abruptly in the next period. 
Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221018121932.10792-4-shikemeng@huawei.com Signed-off-by: Jens Axboe --- include/trace/events/iocost.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index 6d1626e7a4ce..af8bfed528fc 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -38,7 +38,7 @@ DECLARE_EVENT_CLASS(iocost_iocg_state, __assign_str(cgroup, path); __entry->now = now->now; __entry->vnow = now->vnow; - __entry->vrate = now->vrate; + __entry->vrate = iocg->ioc->vtime_base_rate; __entry->last_period = last_period; __entry->cur_period = cur_period; __entry->vtime = vtime; @@ -160,7 +160,7 @@ TRACE_EVENT(iocost_ioc_vrate_adj, TP_fast_assign( __assign_str(devname, ioc_name(ioc)); - __entry->old_vrate = atomic64_read(&ioc->vtime_rate); + __entry->old_vrate = ioc->vtime_base_rate; __entry->new_vrate = new_vrate; __entry->busy_level = ioc->busy_level; __entry->read_missed_ppm = missed_ppm[READ]; -- cgit v1.2.3 From f40eb99897af665f11858dd7b56edcb62c3f3c67 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 2 Dec 2022 19:27:58 +0100 Subject: pktcdvd: remove driver. Way back in 2016 in commit 5a8b187c61e9 ("pktcdvd: mark as unmaintained and deprecated") this driver was marked as "will be removed soon". 5 years seems long enough to have it stick around after that, so finally remove the thing now. Reported-by: Christoph Hellwig Cc: Jens Axboe Cc: Thomas Maier Cc: Peter Osterlund Cc: linux-block@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20221202182758.1339039-1-gregkh@linuxfoundation.org Signed-off-by: Jens Axboe --- include/linux/pktcdvd.h | 197 ------------------------------------------- include/uapi/linux/pktcdvd.h | 112 ------------------------ 2 files changed, 309 deletions(-) delete mode 100644 include/linux/pktcdvd.h delete mode 100644 include/uapi/linux/pktcdvd.h (limited to 'include') diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h deleted file mode 100644 index f9c5ac80d59b..000000000000 --- a/include/linux/pktcdvd.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright (C) 2000 Jens Axboe - * Copyright (C) 2001-2004 Peter Osterlund - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - * - * Packet writing layer for ATAPI and SCSI CD-R, CD-RW, DVD-R, and - * DVD-RW devices. - * - */ -#ifndef __PKTCDVD_H -#define __PKTCDVD_H - -#include -#include -#include -#include -#include -#include -#include - -/* default bio write queue congestion marks */ -#define PKT_WRITE_CONGESTION_ON 10000 -#define PKT_WRITE_CONGESTION_OFF 9000 - - -struct packet_settings -{ - __u32 size; /* packet size in (512 byte) sectors */ - __u8 fp; /* fixed packets */ - __u8 link_loss; /* the rest is specified - * as per Mt Fuji */ - __u8 write_type; - __u8 track_mode; - __u8 block_mode; -}; - -/* - * Very crude stats for now - */ -struct packet_stats -{ - unsigned long pkt_started; - unsigned long pkt_ended; - unsigned long secs_w; - unsigned long secs_rg; - unsigned long secs_r; -}; - -struct packet_cdrw -{ - struct list_head pkt_free_list; - struct list_head pkt_active_list; - spinlock_t active_list_lock; /* Serialize access to pkt_active_list */ - struct task_struct *thread; - atomic_t pending_bios; -}; - -/* - * Switch to high speed reading after reading this many kilobytes - * with no interspersed writes. 
- */ -#define HI_SPEED_SWITCH 512 - -struct packet_iosched -{ - atomic_t attention; /* Set to non-zero when queue processing is needed */ - int writing; /* Non-zero when writing, zero when reading */ - spinlock_t lock; /* Protecting read/write queue manipulations */ - struct bio_list read_queue; - struct bio_list write_queue; - sector_t last_write; /* The sector where the last write ended */ - int successive_reads; -}; - -/* - * 32 buffers of 2048 bytes - */ -#if (PAGE_SIZE % CD_FRAMESIZE) != 0 -#error "PAGE_SIZE must be a multiple of CD_FRAMESIZE" -#endif -#define PACKET_MAX_SIZE 128 -#define FRAMES_PER_PAGE (PAGE_SIZE / CD_FRAMESIZE) -#define PACKET_MAX_SECTORS (PACKET_MAX_SIZE * CD_FRAMESIZE >> 9) - -enum packet_data_state { - PACKET_IDLE_STATE, /* Not used at the moment */ - PACKET_WAITING_STATE, /* Waiting for more bios to arrive, so */ - /* we don't have to do as much */ - /* data gathering */ - PACKET_READ_WAIT_STATE, /* Waiting for reads to fill in holes */ - PACKET_WRITE_WAIT_STATE, /* Waiting for the write to complete */ - PACKET_RECOVERY_STATE, /* Recover after read/write errors */ - PACKET_FINISHED_STATE, /* After write has finished */ - - PACKET_NUM_STATES /* Number of possible states */ -}; - -/* - * Information needed for writing a single packet - */ -struct pktcdvd_device; - -struct packet_data -{ - struct list_head list; - - spinlock_t lock; /* Lock protecting state transitions and */ - /* orig_bios list */ - - struct bio_list orig_bios; /* Original bios passed to pkt_make_request */ - /* that will be handled by this packet */ - int write_size; /* Total size of all bios in the orig_bios */ - /* list, measured in number of frames */ - - struct bio *w_bio; /* The bio we will send to the real CD */ - /* device once we have all data for the */ - /* packet we are going to write */ - sector_t sector; /* First sector in this packet */ - int frames; /* Number of frames in this packet */ - - enum packet_data_state state; /* Current state */ - atomic_t run_sm; /* Incremented whenever the state */ - /* machine needs to be run */ - long sleep_time; /* Set this to non-zero to make the state */ - /* machine run after this many jiffies. */ - - atomic_t io_wait; /* Number of pending IO operations */ - atomic_t io_errors; /* Number of read/write errors during IO */ - - struct bio *r_bios[PACKET_MAX_SIZE]; /* bios to use during data gathering */ - struct page *pages[PACKET_MAX_SIZE / FRAMES_PER_PAGE]; - - int cache_valid; /* If non-zero, the data for the zone defined */ - /* by the sector variable is completely cached */ - /* in the pages[] vector. 
*/ - - int id; /* ID number for debugging */ - struct pktcdvd_device *pd; -}; - -struct pkt_rb_node { - struct rb_node rb_node; - struct bio *bio; -}; - -struct packet_stacked_data -{ - struct bio *bio; /* Original read request bio */ - struct pktcdvd_device *pd; -}; -#define PSD_POOL_SIZE 64 - -struct pktcdvd_device -{ - struct block_device *bdev; /* dev attached */ - dev_t pkt_dev; /* our dev */ - char name[20]; - struct packet_settings settings; - struct packet_stats stats; - int refcnt; /* Open count */ - int write_speed; /* current write speed, kB/s */ - int read_speed; /* current read speed, kB/s */ - unsigned long offset; /* start offset */ - __u8 mode_offset; /* 0 / 8 */ - __u8 type; - unsigned long flags; - __u16 mmc3_profile; - __u32 nwa; /* next writable address */ - __u32 lra; /* last recorded address */ - struct packet_cdrw cdrw; - wait_queue_head_t wqueue; - - spinlock_t lock; /* Serialize access to bio_queue */ - struct rb_root bio_queue; /* Work queue of bios we need to handle */ - int bio_queue_size; /* Number of nodes in bio_queue */ - bool congested; /* Someone is waiting for bio_queue_size - * to drop. */ - sector_t current_sector; /* Keep track of where the elevator is */ - atomic_t scan_queue; /* Set to non-zero when pkt_handle_queue */ - /* needs to be run. */ - mempool_t rb_pool; /* mempool for pkt_rb_node allocations */ - - struct packet_iosched iosched; - struct gendisk *disk; - - int write_congestion_off; - int write_congestion_on; - - struct device *dev; /* sysfs pktcdvd[0-7] dev */ - - struct dentry *dfs_d_root; /* debugfs: devname directory */ - struct dentry *dfs_f_info; /* debugfs: info file */ -}; - -#endif /* __PKTCDVD_H */ diff --git a/include/uapi/linux/pktcdvd.h b/include/uapi/linux/pktcdvd.h deleted file mode 100644 index 9cbb55d21c94..000000000000 --- a/include/uapi/linux/pktcdvd.h +++ /dev/null @@ -1,112 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Copyright (C) 2000 Jens Axboe - * Copyright (C) 2001-2004 Peter Osterlund - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - * - * Packet writing layer for ATAPI and SCSI CD-R, CD-RW, DVD-R, and - * DVD-RW devices. - * - */ -#ifndef _UAPI__PKTCDVD_H -#define _UAPI__PKTCDVD_H - -#include - -/* - * 1 for normal debug messages, 2 is very verbose. 0 to turn it off. - */ -#define PACKET_DEBUG 1 - -#define MAX_WRITERS 8 - -#define PKT_RB_POOL_SIZE 512 - -/* - * How long we should hold a non-full packet before starting data gathering. - */ -#define PACKET_WAIT_TIME (HZ * 5 / 1000) - -/* - * use drive write caching -- we need deferred error handling to be - * able to successfully recover with this option (drive will return good - * status as soon as the cdb is validated). 
- */ -#if defined(CONFIG_CDROM_PKTCDVD_WCACHE) -#define USE_WCACHING 1 -#else -#define USE_WCACHING 0 -#endif - -/* - * No user-servicable parts beyond this point -> - */ - -/* - * device types - */ -#define PACKET_CDR 1 -#define PACKET_CDRW 2 -#define PACKET_DVDR 3 -#define PACKET_DVDRW 4 - -/* - * flags - */ -#define PACKET_WRITABLE 1 /* pd is writable */ -#define PACKET_NWA_VALID 2 /* next writable address valid */ -#define PACKET_LRA_VALID 3 /* last recorded address valid */ -#define PACKET_MERGE_SEGS 4 /* perform segment merging to keep */ - /* underlying cdrom device happy */ - -/* - * Disc status -- from READ_DISC_INFO - */ -#define PACKET_DISC_EMPTY 0 -#define PACKET_DISC_INCOMPLETE 1 -#define PACKET_DISC_COMPLETE 2 -#define PACKET_DISC_OTHER 3 - -/* - * write type, and corresponding data block type - */ -#define PACKET_MODE1 1 -#define PACKET_MODE2 2 -#define PACKET_BLOCK_MODE1 8 -#define PACKET_BLOCK_MODE2 10 - -/* - * Last session/border status - */ -#define PACKET_SESSION_EMPTY 0 -#define PACKET_SESSION_INCOMPLETE 1 -#define PACKET_SESSION_RESERVED 2 -#define PACKET_SESSION_COMPLETE 3 - -#define PACKET_MCN "4a656e734178626f65323030300000" - -#undef PACKET_USE_LS - -#define PKT_CTRL_CMD_SETUP 0 -#define PKT_CTRL_CMD_TEARDOWN 1 -#define PKT_CTRL_CMD_STATUS 2 - -struct pkt_ctrl_command { - __u32 command; /* in: Setup, teardown, status */ - __u32 dev_index; /* in/out: Device index */ - __u32 dev; /* in/out: Device nr for cdrw device */ - __u32 pkt_dev; /* in/out: Device nr for packet device */ - __u32 num_devices; /* out: Largest device index + 1 */ - __u32 padding; /* Not used */ -}; - -/* - * packet ioctls - */ -#define PACKET_IOCTL_MAGIC ('X') -#define PACKET_CTRL_CMD _IOWR(PACKET_IOCTL_MAGIC, 1, struct pkt_ctrl_command) - - -#endif /* _UAPI__PKTCDVD_H */ -- cgit v1.2.3 From 85d6ce58e493ac8b7122e2fbe3f41b94d6ebdc11 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 3 Dec 2022 15:07:47 +0100 Subject: block: remove devnode callback from struct block_device_operations With the removal of the pktcdvd driver, there are no in-kernel users of the devnode callback in struct block_device_operations, so it can be safely removed. If it is needed for new block drivers in the future, it can be brought back. Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20221203140747.1942969-1-gregkh@linuxfoundation.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 469299ea0660..2db2ad72af0f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1395,7 +1395,6 @@ struct block_device_operations { void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); - char *(*devnode)(struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], enum blk_unique_id id_type); -- cgit v1.2.3 From db1c7d77976775483a8ef240b4c705f113e13ea1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Dec 2022 15:44:07 +0100 Subject: block: bio_copy_data_iter With the pktcdvdv removal, bio_copy_data_iter is unused now. Fold the logic into bio_copy_data and remove the separate lower level function. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221206144407.722049-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 2c5806997bbf..b231a665682a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -475,8 +475,6 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, - struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); -- cgit v1.2.3 From c34b7ac65087554627f4840f4ecd6f2107a68fd1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Dec 2022 15:40:57 +0100 Subject: block: remove bio_set_op_attrs This macro is obsolete, so replace the last few uses with open coded bi_opf assignments. Signed-off-by: Christoph Hellwig Acked-by: Coly Li > Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20221206144057.720846-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index e0b098089ef2..99be590f952f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -472,13 +472,6 @@ static inline enum req_op bio_op(const struct bio *bio) return bio->bi_opf & REQ_OP_MASK; } -/* obsolete, don't use in new code */ -static inline void bio_set_op_attrs(struct bio *bio, enum req_op op, - blk_opf_t op_flags) -{ - bio->bi_opf = op | op_flags; -} - static inline bool op_is_write(blk_opf_t op) { return !!(op & (__force blk_opf_t)1); -- cgit v1.2.3 From c1f480b2d092960ecf8bb0bd1f27982c33ada42a Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Tue, 6 Dec 2022 09:29:13 +0000 Subject: sed-opal: allow using IOC_OPAL_SAVE for locking too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Usually when closing a crypto device (eg: dm-crypt with LUKS) the volume key is not required, as it requires root privileges anyway, and root can deny access to a disk in many ways regardless. Requiring the volume key to lock the device is a peculiarity of the OPAL specification. Given we might already have saved the key if the user requested it via the 'IOC_OPAL_SAVE' ioctl, we can use that key to lock the device if no key was provided here and the locking range matches, and the user sets the appropriate flag with 'IOC_OPAL_SAVE'. This allows integrating OPAL with tools and libraries that are used to the common behaviour and do not ask for the volume key when closing a device. Callers can always pass a non-zero key and it will be used regardless, as before. 
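A minimal userspace sketch (not from the patch) of how a tool might use the new flag: the key is registered once with IOC_OPAL_SAVE and OPAL_SAVE_FOR_LOCK set, and a later lock request with an empty key falls back to the saved one, provided the locking range matches. The device handling, locking range and error handling are illustrative, and the passphrase must fit the opal_key key field.

/*
 * Userspace sketch: save the key for later locking, then lock the range
 * without supplying the key again.
 */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/sed-opal.h>

static int opal_save_key_for_lock(int fd, const void *key, __u8 key_len)
{
	struct opal_lock_unlock lk = {
		.session.who = OPAL_ADMIN1,
		.session.opal_key.lr = 0,		/* global locking range */
		.session.opal_key.key_len = key_len,
		.l_state = OPAL_RW,
		.flags = OPAL_SAVE_FOR_LOCK,		/* new in this patch */
	};

	memcpy(lk.session.opal_key.key, key, key_len);
	return ioctl(fd, IOC_OPAL_SAVE, &lk);
}

static int opal_lock_without_key(int fd)
{
	struct opal_lock_unlock lk = {
		.session.who = OPAL_ADMIN1,
		.session.opal_key.lr = 0,
		/* key_len == 0: fall back to the key saved above */
		.l_state = OPAL_LK,
	};

	return ioctl(fd, IOC_OPAL_LOCK_UNLOCK, &lk);
}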
Suggested-by: Štěpán Horáček Signed-off-by: Luca Boccassi Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20221206092913.4625-1-luca.boccassi@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/sed-opal.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 2573772e2fb3..1fed3c9294fc 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -44,6 +44,11 @@ enum opal_lock_state { OPAL_LK = 0x04, /* 0100 */ }; +enum opal_lock_flags { + /* IOC_OPAL_SAVE will also store the provided key for locking */ + OPAL_SAVE_FOR_LOCK = 0x01, +}; + struct opal_key { __u8 lr; __u8 key_len; @@ -76,7 +81,8 @@ struct opal_user_lr_setup { struct opal_lock_unlock { struct opal_session_info session; __u32 l_state; - __u8 __align[4]; + __u16 flags; + __u8 __align[2]; }; struct opal_new_pw { -- cgit v1.2.3 From 56fb8d90031f71fa8af48fdff8498b9263b9c759 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 5 Dec 2022 20:16:48 +0100 Subject: block: sed-opal: Don't include <linux/kernel.h> There is no need to include <linux/kernel.h> here. Prefer the less invasive <linux/compiler_types.h> and <linux/types.h> which are needed in this .h file itself. Signed-off-by: Christophe JAILLET Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/c1d479b39e30fe70c4579a1af035d4db49421f56.1670069909.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 6f837bb6c715..31ac562a17d7 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -11,7 +11,8 @@ #define LINUX_OPAL_H #include <uapi/linux/sed-opal.h> -#include <linux/kernel.h> +#include <linux/compiler_types.h> +#include <linux/types.h> struct opal_dev; -- cgit v1.2.3