From 926597ffce0e3e2f785475df18e1636194209910 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:39:11 +0100 Subject: block: move disk_{block,unblock,flush}_events to blk.h No need to have these declarations in a public header. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220124093913.742411-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6906a45bc761..504f9a6674ac 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -185,9 +185,6 @@ static inline int bdev_read_only(struct block_device *bdev) return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); } -extern void disk_block_events(struct gendisk *disk); -extern void disk_unblock_events(struct gendisk *disk); -extern void disk_flush_events(struct gendisk *disk, unsigned int mask); bool set_capacity_and_notify(struct gendisk *disk, sector_t size); bool disk_force_media_change(struct gendisk *disk, unsigned int events); -- cgit v1.2.3 From e7243285c0fc87054990fcde630583586ff8ed5f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:39:12 +0100 Subject: block: move blk_drop_partitions to blk.h No need to have this declaration in a public header. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220124093913.742411-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 504f9a6674ac..aa4bd985dbe5 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -219,7 +219,6 @@ static inline u64 sb_bdev_nr_blocks(struct super_block *sb) } int bdev_disk_changed(struct gendisk *disk, bool invalidate); -void blk_drop_partitions(struct gendisk *disk); struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass); -- cgit v1.2.3 From 322cbb50de711814c42fb088f6d31901502c711a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:39:13 +0100 Subject: block: remove genhd.h There is no good reason to keep genhd.h separate from the main blkdev.h header that includes it. So fold the contents of genhd.h into blkdev.h and remove genhd.h entirely. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220124093913.742411-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 273 ++++++++++++++++++++++++++++++++++++++++++- include/linux/genhd.h | 287 ---------------------------------------------- include/linux/part_stat.h | 2 +- 3 files changed, 271 insertions(+), 291 deletions(-) delete mode 100644 include/linux/genhd.h (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f35aea98bc35..99a4384bb8a5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1,9 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * Portions Copyright (C) 1992 Drew Eckhardt + */ #ifndef _LINUX_BLKDEV_H #define _LINUX_BLKDEV_H -#include -#include +#include +#include +#include #include #include #include @@ -12,11 +16,15 @@ #include #include #include +#include #include #include #include +#include #include #include +#include +#include struct module; struct request_queue; @@ -33,6 +41,10 @@ struct blk_queue_stats; struct blk_stat_callback; struct blk_crypto_profile; +extern const struct device_type disk_type; +extern struct device_type part_type; +extern struct class block_class; + /* Must be consistent with blk_mq_poll_stats_bkt() */ #define BLK_MQ_POLL_STATS_BKTS 16 @@ -45,6 +57,144 @@ struct blk_crypto_profile; */ #define BLKCG_MAX_POLS 6 +#define DISK_MAX_PARTS 256 +#define DISK_NAME_LEN 32 + +#define PARTITION_META_INFO_VOLNAMELTH 64 +/* + * Enough for the string representation of any kind of UUID plus NULL. + * EFI UUID is 36 characters. MSDOS UUID is 11 characters. + */ +#define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1) + +struct partition_meta_info { + char uuid[PARTITION_META_INFO_UUIDLTH]; + u8 volname[PARTITION_META_INFO_VOLNAMELTH]; +}; + +/** + * DOC: genhd capability flags + * + * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to + * removable media. When set, the device remains present even when media is not + * inserted. Shall not be set for devices which are removed entirely when the + * media is removed. + * + * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events, + * doesn't appear in sysfs, and can't be opened from userspace or using + * blkdev_get*. Used for the underlying components of multipath devices. + * + * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not + * scan for partitions from add_disk, and users can't add partitions manually. + * + */ +enum { + GENHD_FL_REMOVABLE = 1 << 0, + GENHD_FL_HIDDEN = 1 << 1, + GENHD_FL_NO_PART = 1 << 2, +}; + +enum { + DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ + DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ +}; + +enum { + /* Poll even if events_poll_msecs is unset */ + DISK_EVENT_FLAG_POLL = 1 << 0, + /* Forward events to udev */ + DISK_EVENT_FLAG_UEVENT = 1 << 1, + /* Block event polling when open for exclusive write */ + DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2, +}; + +struct disk_events; +struct badblocks; + +struct blk_integrity { + const struct blk_integrity_profile *profile; + unsigned char flags; + unsigned char tuple_size; + unsigned char interval_exp; + unsigned char tag_size; +}; + +struct gendisk { + /* + * major/first_minor/minors should not be set by any new driver, the + * block core will take care of allocating them automatically. + */ + int major; + int first_minor; + int minors; + + char disk_name[DISK_NAME_LEN]; /* name of major driver */ + + unsigned short events; /* supported events */ + unsigned short event_flags; /* flags related to event processing */ + + struct xarray part_tbl; + struct block_device *part0; + + const struct block_device_operations *fops; + struct request_queue *queue; + void *private_data; + + int flags; + unsigned long state; +#define GD_NEED_PART_SCAN 0 +#define GD_READ_ONLY 1 +#define GD_DEAD 2 +#define GD_NATIVE_CAPACITY 3 + + struct mutex open_mutex; /* open/close mutex */ + unsigned open_partitions; /* number of open partitions */ + + struct backing_dev_info *bdi; + struct kobject *slave_dir; +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED + struct list_head slave_bdevs; +#endif + struct timer_rand_state *random; + atomic_t sync_io; /* RAID */ + struct disk_events *ev; +#ifdef CONFIG_BLK_DEV_INTEGRITY + struct kobject integrity_kobj; +#endif /* CONFIG_BLK_DEV_INTEGRITY */ +#if IS_ENABLED(CONFIG_CDROM) + struct cdrom_device_info *cdi; +#endif + int node_id; + struct badblocks *bb; + struct lockdep_map lockdep_map; + u64 diskseq; +}; + +static inline bool disk_live(struct gendisk *disk) +{ + return !inode_unhashed(disk->part0->bd_inode); +} + +/* + * The gendisk is refcounted by the part0 block_device, and the bd_device + * therein is also used for device model presentation in sysfs. + */ +#define dev_to_disk(device) \ + (dev_to_bdev(device)->bd_disk) +#define disk_to_dev(disk) \ + (&((disk)->part0->bd_device)) + +#if IS_REACHABLE(CONFIG_CDROM) +#define disk_to_cdi(disk) ((disk)->cdi) +#else +#define disk_to_cdi(disk) NULL +#endif + +static inline dev_t disk_devt(struct gendisk *disk) +{ + return MKDEV(disk->major, disk->first_minor); +} + static inline int blk_validate_block_size(unsigned long bsize) { if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) @@ -596,6 +746,118 @@ static inline unsigned int blk_queue_depth(struct request_queue *q) #define for_each_bio(_bio) \ for (; _bio; _bio = _bio->bi_next) +int __must_check device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups); +static inline int __must_check add_disk(struct gendisk *disk) +{ + return device_add_disk(NULL, disk, NULL); +} +void del_gendisk(struct gendisk *gp); +void invalidate_disk(struct gendisk *disk); +void set_disk_ro(struct gendisk *disk, bool read_only); +void disk_uevent(struct gendisk *disk, enum kobject_action action); + +static inline int get_disk_ro(struct gendisk *disk) +{ + return disk->part0->bd_read_only || + test_bit(GD_READ_ONLY, &disk->state); +} + +static inline int bdev_read_only(struct block_device *bdev) +{ + return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); +} + +bool set_capacity_and_notify(struct gendisk *disk, sector_t size); +bool disk_force_media_change(struct gendisk *disk, unsigned int events); + +void add_disk_randomness(struct gendisk *disk) __latent_entropy; +void rand_initialize_disk(struct gendisk *disk); + +static inline sector_t get_start_sect(struct block_device *bdev) +{ + return bdev->bd_start_sect; +} + +static inline sector_t bdev_nr_sectors(struct block_device *bdev) +{ + return bdev->bd_nr_sectors; +} + +static inline loff_t bdev_nr_bytes(struct block_device *bdev) +{ + return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT; +} + +static inline sector_t get_capacity(struct gendisk *disk) +{ + return bdev_nr_sectors(disk->part0); +} + +static inline u64 sb_bdev_nr_blocks(struct super_block *sb) +{ + return bdev_nr_sectors(sb->s_bdev) >> + (sb->s_blocksize_bits - SECTOR_SHIFT); +} + +int bdev_disk_changed(struct gendisk *disk, bool invalidate); + +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass); +void put_disk(struct gendisk *disk); +struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); + +/** + * blk_alloc_disk - allocate a gendisk structure + * @node_id: numa node to allocate on + * + * Allocate and pre-initialize a gendisk structure for use with BIO based + * drivers. + * + * Context: can sleep + */ +#define blk_alloc_disk(node_id) \ +({ \ + static struct lock_class_key __key; \ + \ + __blk_alloc_disk(node_id, &__key); \ +}) +void blk_cleanup_disk(struct gendisk *disk); + +int __register_blkdev(unsigned int major, const char *name, + void (*probe)(dev_t devt)); +#define register_blkdev(major, name) \ + __register_blkdev(major, name, NULL) +void unregister_blkdev(unsigned int major, const char *name); + +bool bdev_check_media_change(struct block_device *bdev); +int __invalidate_device(struct block_device *bdev, bool kill_dirty); +void set_capacity(struct gendisk *disk, sector_t size); + +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); +int bd_register_pending_holders(struct gendisk *disk); +#else +static inline int bd_link_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ + return 0; +} +static inline void bd_unlink_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ +} +static inline int bd_register_pending_holders(struct gendisk *disk) +{ + return 0; +} +#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ + +dev_t part_devt(struct gendisk *disk, u8 partno); +void inc_diskseq(struct gendisk *disk); +dev_t blk_lookup_devt(const char *name, int partno); +void blk_request_module(dev_t devt); extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); @@ -1311,6 +1573,7 @@ void invalidate_bdev(struct block_device *bdev); int sync_blockdev(struct block_device *bdev); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); +void printk_all_partitions(void); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -1326,7 +1589,11 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) static inline void sync_bdevs(bool wait) { } -#endif +static inline void printk_all_partitions(void) +{ +} +#endif /* CONFIG_BLOCK */ + int fsync_bdev(struct block_device *bdev); int freeze_bdev(struct block_device *bdev); diff --git a/include/linux/genhd.h b/include/linux/genhd.h deleted file mode 100644 index aa4bd985dbe5..000000000000 --- a/include/linux/genhd.h +++ /dev/null @@ -1,287 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_GENHD_H -#define _LINUX_GENHD_H - -/* - * genhd.h Copyright (C) 1992 Drew Eckhardt - * Generic hard disk header file by - * Drew Eckhardt - * - * - */ - -#include -#include -#include -#include -#include -#include - -extern const struct device_type disk_type; -extern struct device_type part_type; -extern struct class block_class; - -#define DISK_MAX_PARTS 256 -#define DISK_NAME_LEN 32 - -#define PARTITION_META_INFO_VOLNAMELTH 64 -/* - * Enough for the string representation of any kind of UUID plus NULL. - * EFI UUID is 36 characters. MSDOS UUID is 11 characters. - */ -#define PARTITION_META_INFO_UUIDLTH (UUID_STRING_LEN + 1) - -struct partition_meta_info { - char uuid[PARTITION_META_INFO_UUIDLTH]; - u8 volname[PARTITION_META_INFO_VOLNAMELTH]; -}; - -/** - * DOC: genhd capability flags - * - * ``GENHD_FL_REMOVABLE``: indicates that the block device gives access to - * removable media. When set, the device remains present even when media is not - * inserted. Shall not be set for devices which are removed entirely when the - * media is removed. - * - * ``GENHD_FL_HIDDEN``: the block device is hidden; it doesn't produce events, - * doesn't appear in sysfs, and can't be opened from userspace or using - * blkdev_get*. Used for the underlying components of multipath devices. - * - * ``GENHD_FL_NO_PART``: partition support is disabled. The kernel will not - * scan for partitions from add_disk, and users can't add partitions manually. - * - */ -enum { - GENHD_FL_REMOVABLE = 1 << 0, - GENHD_FL_HIDDEN = 1 << 1, - GENHD_FL_NO_PART = 1 << 2, -}; - -enum { - DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ - DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ -}; - -enum { - /* Poll even if events_poll_msecs is unset */ - DISK_EVENT_FLAG_POLL = 1 << 0, - /* Forward events to udev */ - DISK_EVENT_FLAG_UEVENT = 1 << 1, - /* Block event polling when open for exclusive write */ - DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE = 1 << 2, -}; - -struct disk_events; -struct badblocks; - -struct blk_integrity { - const struct blk_integrity_profile *profile; - unsigned char flags; - unsigned char tuple_size; - unsigned char interval_exp; - unsigned char tag_size; -}; - -struct gendisk { - /* - * major/first_minor/minors should not be set by any new driver, the - * block core will take care of allocating them automatically. - */ - int major; - int first_minor; - int minors; - - char disk_name[DISK_NAME_LEN]; /* name of major driver */ - - unsigned short events; /* supported events */ - unsigned short event_flags; /* flags related to event processing */ - - struct xarray part_tbl; - struct block_device *part0; - - const struct block_device_operations *fops; - struct request_queue *queue; - void *private_data; - - int flags; - unsigned long state; -#define GD_NEED_PART_SCAN 0 -#define GD_READ_ONLY 1 -#define GD_DEAD 2 -#define GD_NATIVE_CAPACITY 3 - - struct mutex open_mutex; /* open/close mutex */ - unsigned open_partitions; /* number of open partitions */ - - struct backing_dev_info *bdi; - struct kobject *slave_dir; -#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED - struct list_head slave_bdevs; -#endif - struct timer_rand_state *random; - atomic_t sync_io; /* RAID */ - struct disk_events *ev; -#ifdef CONFIG_BLK_DEV_INTEGRITY - struct kobject integrity_kobj; -#endif /* CONFIG_BLK_DEV_INTEGRITY */ -#if IS_ENABLED(CONFIG_CDROM) - struct cdrom_device_info *cdi; -#endif - int node_id; - struct badblocks *bb; - struct lockdep_map lockdep_map; - u64 diskseq; -}; - -static inline bool disk_live(struct gendisk *disk) -{ - return !inode_unhashed(disk->part0->bd_inode); -} - -/* - * The gendisk is refcounted by the part0 block_device, and the bd_device - * therein is also used for device model presentation in sysfs. - */ -#define dev_to_disk(device) \ - (dev_to_bdev(device)->bd_disk) -#define disk_to_dev(disk) \ - (&((disk)->part0->bd_device)) - -#if IS_REACHABLE(CONFIG_CDROM) -#define disk_to_cdi(disk) ((disk)->cdi) -#else -#define disk_to_cdi(disk) NULL -#endif - -static inline dev_t disk_devt(struct gendisk *disk) -{ - return MKDEV(disk->major, disk->first_minor); -} - -void disk_uevent(struct gendisk *disk, enum kobject_action action); - -/* block/genhd.c */ -int __must_check device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups); -static inline int __must_check add_disk(struct gendisk *disk) -{ - return device_add_disk(NULL, disk, NULL); -} -extern void del_gendisk(struct gendisk *gp); - -void invalidate_disk(struct gendisk *disk); - -void set_disk_ro(struct gendisk *disk, bool read_only); - -static inline int get_disk_ro(struct gendisk *disk) -{ - return disk->part0->bd_read_only || - test_bit(GD_READ_ONLY, &disk->state); -} - -static inline int bdev_read_only(struct block_device *bdev) -{ - return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); -} - -bool set_capacity_and_notify(struct gendisk *disk, sector_t size); -bool disk_force_media_change(struct gendisk *disk, unsigned int events); - -/* drivers/char/random.c */ -extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; -extern void rand_initialize_disk(struct gendisk *disk); - -static inline sector_t get_start_sect(struct block_device *bdev) -{ - return bdev->bd_start_sect; -} - -static inline sector_t bdev_nr_sectors(struct block_device *bdev) -{ - return bdev->bd_nr_sectors; -} - -static inline loff_t bdev_nr_bytes(struct block_device *bdev) -{ - return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT; -} - -static inline sector_t get_capacity(struct gendisk *disk) -{ - return bdev_nr_sectors(disk->part0); -} - -static inline u64 sb_bdev_nr_blocks(struct super_block *sb) -{ - return bdev_nr_sectors(sb->s_bdev) >> - (sb->s_blocksize_bits - SECTOR_SHIFT); -} - -int bdev_disk_changed(struct gendisk *disk, bool invalidate); - -struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, - struct lock_class_key *lkclass); -extern void put_disk(struct gendisk *disk); -struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); - -/** - * blk_alloc_disk - allocate a gendisk structure - * @node_id: numa node to allocate on - * - * Allocate and pre-initialize a gendisk structure for use with BIO based - * drivers. - * - * Context: can sleep - */ -#define blk_alloc_disk(node_id) \ -({ \ - static struct lock_class_key __key; \ - \ - __blk_alloc_disk(node_id, &__key); \ -}) -void blk_cleanup_disk(struct gendisk *disk); - -int __register_blkdev(unsigned int major, const char *name, - void (*probe)(dev_t devt)); -#define register_blkdev(major, name) \ - __register_blkdev(major, name, NULL) -void unregister_blkdev(unsigned int major, const char *name); - -bool bdev_check_media_change(struct block_device *bdev); -int __invalidate_device(struct block_device *bdev, bool kill_dirty); -void set_capacity(struct gendisk *disk, sector_t size); - -#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED -int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); -void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); -int bd_register_pending_holders(struct gendisk *disk); -#else -static inline int bd_link_disk_holder(struct block_device *bdev, - struct gendisk *disk) -{ - return 0; -} -static inline void bd_unlink_disk_holder(struct block_device *bdev, - struct gendisk *disk) -{ -} -static inline int bd_register_pending_holders(struct gendisk *disk) -{ - return 0; -} -#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ - -dev_t part_devt(struct gendisk *disk, u8 partno); -void inc_diskseq(struct gendisk *disk); -dev_t blk_lookup_devt(const char *name, int partno); -void blk_request_module(dev_t devt); -#ifdef CONFIG_BLOCK -void printk_all_partitions(void); -#else /* CONFIG_BLOCK */ -static inline void printk_all_partitions(void) -{ -} -#endif /* CONFIG_BLOCK */ - -#endif /* _LINUX_GENHD_H */ diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 6f7949b2fd8d..abeba356bc3f 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -2,7 +2,7 @@ #ifndef _LINUX_PART_STAT_H #define _LINUX_PART_STAT_H -#include +#include #include struct disk_stats { -- cgit v1.2.3 From 0a3140ea0fae377c9eaa031b7db1670ae422ed47 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 24 Jan 2022 10:11:02 +0100 Subject: block: pass a block_device and opf to blk_next_bio All callers need to set the block_device and operation, so lift that into the common code. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220124091107.642561-15-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 117d7f248ac9..edeae54074ed 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -790,6 +790,7 @@ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb) bio->bi_opf |= REQ_NOWAIT; } -struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp); +struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, + unsigned int nr_pages, unsigned int opf, gfp_t gfp); #endif /* __LINUX_BIO_H */ -- cgit v1.2.3 From 609be1066731fea86436f5f91022f82e592ab456 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:11:03 +0100 Subject: block: pass a block_device and opf to bio_alloc_bioset Pass the block_device and operation that we plan to use this bio for to bio_alloc_bioset to optimize the assigment. NULL/0 can be passed, both for the passthrough case on a raw request_queue and to temporarily avoid refactoring some nasty code. Also move the gfp_mask argument after the nr_vecs argument for a much more logical calling convention matching what most of the kernel does. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220124091107.642561-16-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index edeae54074ed..2f63ae9a71e1 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -405,8 +405,9 @@ extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); -struct bio *bio_alloc_bioset(gfp_t gfp, unsigned short nr_iovecs, - struct bio_set *bs); +struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, + unsigned int opf, gfp_t gfp_mask, + struct bio_set *bs); struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, struct bio_set *bs); struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); @@ -419,7 +420,7 @@ extern struct bio_set fs_bio_set; static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs) { - return bio_alloc_bioset(gfp_mask, nr_iovecs, &fs_bio_set); + return bio_alloc_bioset(NULL, nr_iovecs, 0, gfp_mask, &fs_bio_set); } void submit_bio(struct bio *bio); -- cgit v1.2.3 From b77c88c2100ce6a5ec8126c13599b5a7f6663e32 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:11:04 +0100 Subject: block: pass a block_device and opf to bio_alloc_kiocb Pass the block_device and operation that we plan to use this bio for to bio_alloc_kiocb to optimize the assigment. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220124091107.642561-17-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 2f63ae9a71e1..5c5ada2ebb27 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -408,8 +408,8 @@ extern int bioset_init_from_src(struct bio_set *bs, struct bio_set *src); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, unsigned int opf, gfp_t gfp_mask, struct bio_set *bs); -struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, - struct bio_set *bs); +struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, + unsigned short nr_vecs, unsigned int opf, struct bio_set *bs); struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); extern void bio_put(struct bio *); -- cgit v1.2.3 From 07888c665b405b1cd3577ddebfeb74f4717a84c4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:11:05 +0100 Subject: block: pass a block_device and opf to bio_alloc Pass the block_device and operation that we plan to use this bio for to bio_alloc to optimize the assignment. NULL/0 can be passed, both for the passthrough case on a raw request_queue and to temporarily avoid refactoring some nasty code. Also move the gfp_mask argument after the nr_vecs argument for a much more logical calling convention matching what most of the kernel does. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220124091107.642561-18-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 5c5ada2ebb27..be6ac92913d4 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -418,9 +418,10 @@ extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio_set fs_bio_set; -static inline struct bio *bio_alloc(gfp_t gfp_mask, unsigned short nr_iovecs) +static inline struct bio *bio_alloc(struct block_device *bdev, + unsigned short nr_vecs, unsigned int opf, gfp_t gfp_mask) { - return bio_alloc_bioset(NULL, nr_iovecs, 0, gfp_mask, &fs_bio_set); + return bio_alloc_bioset(bdev, nr_vecs, opf, gfp_mask, &fs_bio_set); } void submit_bio(struct bio *bio); -- cgit v1.2.3 From 49add4966d79244013fce35f95c6833fae82b8b1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:11:06 +0100 Subject: block: pass a block_device and opf to bio_init Pass the block_device that we plan to use this bio for and the operation to bio_init to optimize the assignment. A NULL block_device can be passed, both for the passthrough case on a raw request_queue and to temporarily avoid refactoring some nasty code. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220124091107.642561-19-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index be6ac92913d4..41bedf727f59 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -456,8 +456,8 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) struct request_queue; extern int submit_bio_wait(struct bio *bio); -extern void bio_init(struct bio *bio, struct bio_vec *table, - unsigned short max_vecs); +void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, + unsigned short max_vecs, unsigned int opf); extern void bio_uninit(struct bio *); extern void bio_reset(struct bio *); void bio_chain(struct bio *, struct bio *); -- cgit v1.2.3 From a7c50c940477bae89fb2b4f51bd969a2d95d7512 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 24 Jan 2022 10:11:07 +0100 Subject: block: pass a block_device and opf to bio_reset Pass the block_device that we plan to use this bio for and the operation to bio_reset to optimize the assigment. A NULL block_device can be passed, both for the passthrough case on a raw request_queue and to temporarily avoid refactoring some nasty code. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220124091107.642561-20-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 41bedf727f59..18cfe5bb41ea 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -459,7 +459,7 @@ extern int submit_bio_wait(struct bio *bio); void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, unsigned int opf); extern void bio_uninit(struct bio *); -extern void bio_reset(struct bio *); +void bio_reset(struct bio *bio, struct block_device *bdev, unsigned int opf); void bio_chain(struct bio *, struct bio *); int bio_add_page(struct bio *, struct page *, unsigned len, unsigned off); @@ -517,13 +517,6 @@ static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) bio_associate_blkg(bio); } -static inline void bio_copy_dev(struct bio *dst, struct bio *src) -{ - bio_clear_flag(dst, BIO_REMAPPED); - dst->bi_bdev = src->bi_bdev; - bio_clone_blkg_association(dst, src); -} - /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * -- cgit v1.2.3 From b1f866b013e6e5583f2f0bf4a61d13eddb9a1799 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jan 2022 08:05:48 +0100 Subject: block: remove blk_needs_flush_plug blk_needs_flush_plug fails to account for the cb_list, which needs flushing as well. Remove it and just check if there is a plug instead of poking into the internals of the plug structure. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220127070549.1377856-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99a4384bb8a5..f902a1c2fac0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1055,14 +1055,6 @@ extern void blk_finish_plug(struct blk_plug *); void blk_flush_plug(struct blk_plug *plug, bool from_schedule); -static inline bool blk_needs_flush_plug(struct task_struct *tsk) -{ - struct blk_plug *plug = tsk->plug; - - return plug && - (plug->mq_list || !list_empty(&plug->cb_list)); -} - int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ @@ -1086,11 +1078,6 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } -static inline bool blk_needs_flush_plug(struct task_struct *tsk) -{ - return false; -} - static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; -- cgit v1.2.3 From aa8dcccaf32bfdc09f2aff089d5d60c37da5b7b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jan 2022 08:05:49 +0100 Subject: block: check that there is a plug in blk_flush_plug Rename blk_flush_plug to __blk_flush_plug and add a wrapper that includes the NULL check instead of open coding that check everywhere. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220127070549.1377856-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f902a1c2fac0..654163d3b903 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1053,7 +1053,12 @@ extern void blk_start_plug(struct blk_plug *); extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short); extern void blk_finish_plug(struct blk_plug *); -void blk_flush_plug(struct blk_plug *plug, bool from_schedule); +void __blk_flush_plug(struct blk_plug *plug, bool from_schedule); +static inline void blk_flush_plug(struct blk_plug *plug, bool async) +{ + if (plug) + __blk_flush_plug(plug, async); +} int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); -- cgit v1.2.3 From b42c1fc3d55e077d36718ad9800d89100b2aff81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 27 Jan 2022 07:41:25 +0100 Subject: block: fix the kerneldoc for bio_end_io_acct Document the actually existing parameter name. Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220127064125.1314347-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 654163d3b903..3bfc75a2a450 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1520,7 +1520,7 @@ void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, /** * bio_end_io_acct - end I/O accounting for bio based drivers * @bio: bio to end account for - * @start: start time returned by bio_start_io_acct() + * @start_time: start time returned by bio_start_io_acct() */ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) { -- cgit v1.2.3 From 2651bf680bc2ad9a078b7222b0873145ab4ece07 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 3 Feb 2022 11:28:25 -0800 Subject: block: introduce BLK_STS_OFFLINE Currently, drivers reports BLK_STS_IOERR for devices that are not full online or being removed. This behavior could cause confusion for users, as they are not really I/O errors from the device. Solve this issue with a new state BLK_STS_OFFLINE, which reports "device offline error" in dmesg instead of "I/O error". EIO is intentionally kept to not change user visible return value. Signed-off-by: Song Liu Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220203192827.1370270-2-song@kernel.org Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index fe065c394fff..5561e58d158a 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -153,6 +153,13 @@ typedef u8 __bitwise blk_status_t; */ #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16) +/* + * BLK_STS_OFFLINE is returned from the driver when the target device is offline + * or is being taken offline. This could help differentiate the case where a + * device is intentionally being shut down from a real I/O error. + */ +#define BLK_STS_OFFLINE ((__force blk_status_t)17) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with -- cgit v1.2.3 From 56b4b5abcdab6daf71c5536fca2772f178590e06 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:06 +0100 Subject: block: clone crypto and integrity data in __bio_clone_fast __bio_clone_fast should also clone integrity and crypto data, as a clone without those is incomplete. Right now the only caller that can actually support crypto and integrity data (dm) does it manually for the one callchain that supports these, but we better do it properly in the core. Note that all callers except for the above mentioned one also don't need to handle failure at all, given that the integrity and crypto clones are based on mempool allocations that won't fail for sleeping allocations. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-11-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 18cfe5bb41ea..b814361c957b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -413,7 +413,7 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); extern void bio_put(struct bio *); -extern void __bio_clone_fast(struct bio *, struct bio *); +int __bio_clone_fast(struct bio *bio, struct bio *bio_src, gfp_t gfp); extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio_set fs_bio_set; -- cgit v1.2.3 From abfc426d1b2fb2176df59851a64223b58ddae7e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:09 +0100 Subject: block: pass a block_device to bio_clone_fast Pass a block_device to bio_clone_fast and __bio_clone_fast and give the functions more suitable names. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-14-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index b814361c957b..7523aba4ddf7 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -413,8 +413,10 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, struct block_device *bdev, struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned short nr_iovecs); extern void bio_put(struct bio *); -int __bio_clone_fast(struct bio *bio, struct bio *bio_src, gfp_t gfp); -extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); +struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, + gfp_t gfp, struct bio_set *bs); +int bio_init_clone(struct block_device *bdev, struct bio *bio, + struct bio *bio_src, gfp_t gfp); extern struct bio_set fs_bio_set; -- cgit v1.2.3 From 3301bc53358a0eb0a0db65fd7a513cd4cb50c83a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 10 Jan 2022 15:29:45 +0800 Subject: lib/sbitmap: kill 'depth' from sbitmap_word Only the last sbitmap_word can have different depth, and all the others must have same depth of 1U << sb->shift, so not necessary to store it in sbitmap_word, and it can be retrieved easily and efficiently by adding one internal helper of __map_depth(sb, index). Remove 'depth' field from sbitmap_word, then the annotation of ____cacheline_aligned_in_smp for 'word' isn't needed any more. Not see performance effect when running high parallel IOPS test on null_blk. This way saves us one cacheline(usually 64 words) per each sbitmap_word. Cc: Martin Wilck Signed-off-by: Ming Lei Reviewed-by: Martin Wilck Reviewed-by: John Garry Link: https://lore.kernel.org/r/20220110072945.347535-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index 95df357ec009..df3b584b0f0c 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -27,15 +27,10 @@ struct seq_file; * struct sbitmap_word - Word in a &struct sbitmap. */ struct sbitmap_word { - /** - * @depth: Number of bits being used in @word/@cleared - */ - unsigned long depth; - /** * @word: word holding free bits */ - unsigned long word ____cacheline_aligned_in_smp; + unsigned long word; /** * @cleared: word holding cleared bits @@ -164,6 +159,14 @@ struct sbitmap_queue { int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node, bool round_robin, bool alloc_hint); +/* sbitmap internal helper */ +static inline unsigned int __map_depth(const struct sbitmap *sb, int index) +{ + if (index == sb->map_nr - 1) + return sb->depth - (index << sb->shift); + return 1U << sb->shift; +} + /** * sbitmap_free() - Free memory used by a &struct sbitmap. * @sb: Bitmap to free. @@ -251,7 +254,7 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb, while (scanned < sb->depth) { unsigned long word; unsigned int depth = min_t(unsigned int, - sb->map[index].depth - nr, + __map_depth(sb, index) - nr, sb->depth - scanned); scanned += depth; -- cgit v1.2.3 From 3f607293b74d6acb06571a774a500143c1f0ed2c Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 8 Feb 2022 20:07:04 +0800 Subject: sbitmap: Delete old sbitmap_queue_get_shallow() Since __sbitmap_queue_get_shallow() was introduced in commit c05e66733788 ("sbitmap: add sbitmap_get_shallow() operation"), it has not been used. Delete __sbitmap_queue_get_shallow() and rename public __sbitmap_queue_get_shallow() -> sbitmap_queue_get_shallow() as it is odd to have public __foo but no foo at all. Signed-off-by: John Garry Link: https://lore.kernel.org/r/1644322024-105340-1-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index df3b584b0f0c..dffeb8281c2d 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -135,7 +135,7 @@ struct sbitmap_queue { /** * @min_shallow_depth: The minimum shallow depth which may be passed to - * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow(). + * sbitmap_queue_get_shallow() */ unsigned int min_shallow_depth; }; @@ -463,7 +463,7 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, unsigned int *offset); /** - * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct + * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct * sbitmap_queue, limiting the depth used from each word, with preemption * already disabled. * @sbq: Bitmap queue to allocate from. @@ -475,8 +475,8 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ -int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, - unsigned int shallow_depth); +int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, + unsigned int shallow_depth); /** * sbitmap_queue_get() - Try to allocate a free bit from a &struct @@ -498,32 +498,6 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, return nr; } -/** - * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct - * sbitmap_queue, limiting the depth used from each word. - * @sbq: Bitmap queue to allocate from. - * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to - * sbitmap_queue_clear()). - * @shallow_depth: The maximum number of bits to allocate from a single word. - * See sbitmap_get_shallow(). - * - * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after - * initializing @sbq. - * - * Return: Non-negative allocated bit number if successful, -1 otherwise. - */ -static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, - unsigned int *cpu, - unsigned int shallow_depth) -{ - int nr; - - *cpu = get_cpu(); - nr = __sbitmap_queue_get_shallow(sbq, shallow_depth); - put_cpu(); - return nr; -} - /** * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the * minimum shallow depth that will be used. -- cgit v1.2.3 From d5869fdc189f0f12a954a48d58a48104a2f5d044 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 10 Feb 2022 14:52:22 -0800 Subject: block: introduce block_rq_error tracepoint Currently, rasdaemon uses the existing tracepoint block_rq_complete and filters out non-error cases in order to capture block disk errors. But there are a few problems with this approach: 1. Even kernel trace filter could do the filtering work, there is still some overhead after we enable this tracepoint. 2. The filter is merely based on errno, which does not align with kernel logic to check the errors for print_req_error(). 3. block_rq_complete only provides dev major and minor to identify the block device, it is not convenient to use in user-space. So introduce a new tracepoint block_rq_error just for the error case. With this patch, rasdaemon could switch to block_rq_error. Since the new tracepoint has the similar implementation with block_rq_complete, so move the existing code from TRACE_EVENT block_rq_complete() into new event class block_rq_completion(). Then add event for block_rq_complete and block_rq_err respectively from the newly created event class per the suggestion from Chaitanya Kulkarni. Cc: Jens Axboe Cc: Christoph Hellwig Reviewed-by: Steven Rostedt Signed-off-by: Cong Wang Signed-off-by: Chaitanya Kulkarni Signed-off-by: Yang Shi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220210225222.260069-1-shy828301@gmail.com Signed-off-by: Jens Axboe --- include/trace/events/block.h | 49 ++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 27170e40e8c9..7f4dfbdf12a6 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -100,19 +100,7 @@ TRACE_EVENT(block_rq_requeue, __entry->nr_sector, 0) ); -/** - * block_rq_complete - block IO operation completed by device driver - * @rq: block operations request - * @error: status code - * @nr_bytes: number of completed bytes - * - * The block_rq_complete tracepoint event indicates that some portion - * of operation request has been completed by the device driver. If - * the @rq->bio is %NULL, then there is absolutely no additional work to - * do for the request. If @rq->bio is non-NULL then there is - * additional work required to complete the request. - */ -TRACE_EVENT(block_rq_complete, +DECLARE_EVENT_CLASS(block_rq_completion, TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), @@ -144,6 +132,41 @@ TRACE_EVENT(block_rq_complete, __entry->nr_sector, __entry->error) ); +/** + * block_rq_complete - block IO operation completed by device driver + * @rq: block operations request + * @error: status code + * @nr_bytes: number of completed bytes + * + * The block_rq_complete tracepoint event indicates that some portion + * of operation request has been completed by the device driver. If + * the @rq->bio is %NULL, then there is absolutely no additional work to + * do for the request. If @rq->bio is non-NULL then there is + * additional work required to complete the request. + */ +DEFINE_EVENT(block_rq_completion, block_rq_complete, + + TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), + + TP_ARGS(rq, error, nr_bytes) +); + +/** + * block_rq_error - block IO operation error reported by device driver + * @rq: block operations request + * @error: status code + * @nr_bytes: number of completed bytes + * + * The block_rq_error tracepoint event indicates that some portion + * of operation request has failed as reported by the device driver. + */ +DEFINE_EVENT(block_rq_completion, block_rq_error, + + TP_PROTO(struct request *rq, blk_status_t error, unsigned int nr_bytes), + + TP_ARGS(rq, error, nr_bytes) +); + DECLARE_EVENT_CLASS(block_rq, TP_PROTO(struct request *rq), -- cgit v1.2.3 From 0e51e2ab49a99bc5077760aa083dfa1c3bf9899b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 11 Feb 2022 18:11:47 +0800 Subject: block: remove THROTL_IOPS_MAX No one uses THROTL_IOPS_MAX any more, so remove it. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220211101149.2368042-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index b4de2010fba5..bdc49bd4eef0 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -28,8 +28,6 @@ /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH (INT_MAX / 2) -/* Max limits for throttle policy */ -#define THROTL_IOPS_MAX UINT_MAX #define FC_APPID_LEN 129 -- cgit v1.2.3 From 672fdcf0e7de3b1e39416ac85abf178f023271f1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 11 Feb 2022 18:11:49 +0800 Subject: block: partition include/linux/blk-cgroup.h Partition include/linux/blk-cgroup.h into two parts: one is public part, the other is block layer private part. Suggested by Christoph Hellwig. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220211101149.2368042-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 459 +-------------------------------------------- 1 file changed, 5 insertions(+), 454 deletions(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index bdc49bd4eef0..f2ad8ed8f777 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -25,12 +25,8 @@ #include #include -/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ -#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) - #define FC_APPID_LEN 129 - #ifdef CONFIG_BLK_CGROUP enum blkg_iostat_type { @@ -42,6 +38,7 @@ enum blkg_iostat_type { }; struct blkcg_gq; +struct blkg_policy_data; struct blkcg { struct cgroup_subsys_state css; @@ -74,36 +71,6 @@ struct blkg_iostat_set { struct blkg_iostat last; }; -/* - * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a - * request_queue (q). This is used by blkcg policies which need to track - * information per blkcg - q pair. - * - * There can be multiple active blkcg policies and each blkg:policy pair is - * represented by a blkg_policy_data which is allocated and freed by each - * policy's pd_alloc/free_fn() methods. A policy can allocate private data - * area by allocating larger data structure which embeds blkg_policy_data - * at the beginning. - */ -struct blkg_policy_data { - /* the blkg and policy id this per-policy data belongs to */ - struct blkcg_gq *blkg; - int plid; -}; - -/* - * Policies that need to keep per-blkcg data which is independent from any - * request_queue associated to it should implement cpd_alloc/free_fn() - * methods. A policy can allocate private data area by allocating larger - * data structure which embeds blkcg_policy_data at the beginning. - * cpd_init() is invoked to let each policy handle per-blkcg data. - */ -struct blkcg_policy_data { - /* the blkcg and policy id this per-policy data belongs to */ - struct blkcg *blkcg; - int plid; -}; - /* association between a blk cgroup and a request queue */ struct blkcg_gq { /* Pointer to the associated request_queue */ @@ -139,120 +106,17 @@ struct blkcg_gq { struct rcu_head rcu_head; }; -typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); -typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); -typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); -typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); -typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, - struct request_queue *q, struct blkcg *blkcg); -typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); -typedef bool (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, - struct seq_file *s); - -struct blkcg_policy { - int plid; - /* cgroup files for the policy */ - struct cftype *dfl_cftypes; - struct cftype *legacy_cftypes; - - /* operations */ - blkcg_pol_alloc_cpd_fn *cpd_alloc_fn; - blkcg_pol_init_cpd_fn *cpd_init_fn; - blkcg_pol_free_cpd_fn *cpd_free_fn; - blkcg_pol_bind_cpd_fn *cpd_bind_fn; - - blkcg_pol_alloc_pd_fn *pd_alloc_fn; - blkcg_pol_init_pd_fn *pd_init_fn; - blkcg_pol_online_pd_fn *pd_online_fn; - blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_free_pd_fn *pd_free_fn; - blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; - blkcg_pol_stat_pd_fn *pd_stat_fn; -}; - -extern struct blkcg blkcg_root; extern struct cgroup_subsys_state * const blkcg_root_css; -extern bool blkcg_debug_stats; - -struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, - struct request_queue *q, bool update_hint); -int blkcg_init_queue(struct request_queue *q); -void blkcg_exit_queue(struct request_queue *q); - -/* Blkio controller policy registration */ -int blkcg_policy_register(struct blkcg_policy *pol); -void blkcg_policy_unregister(struct blkcg_policy *pol); -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol); -void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol); - -const char *blkg_dev_name(struct blkcg_gq *blkg); -void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, - u64 (*prfill)(struct seq_file *, - struct blkg_policy_data *, int), - const struct blkcg_policy *pol, int data, - bool show_total); -u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); - -struct blkg_conf_ctx { - struct block_device *bdev; - struct blkcg_gq *blkg; - char *body; -}; - -struct block_device *blkcg_conf_open_bdev(char **inputp); -int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - char *input, struct blkg_conf_ctx *ctx); -void blkg_conf_finish(struct blkg_conf_ctx *ctx); -/** - * blkcg_css - find the current css - * - * Find the css associated with either the kthread or the current task. - * This may return a dying css, so it is up to the caller to use tryget logic - * to confirm it is alive and well. - */ -static inline struct cgroup_subsys_state *blkcg_css(void) -{ - struct cgroup_subsys_state *css; - - css = kthread_blkcg(); - if (css) - return css; - return task_css(current, io_cgrp_id); -} +void blkcg_destroy_blkgs(struct blkcg *blkcg); +void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); +void blkcg_maybe_throttle_current(void); static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) { return css ? container_of(css, struct blkcg, css) : NULL; } -/** - * __bio_blkcg - internal, inconsistent version to get blkcg - * - * DO NOT USE. - * This function is inconsistent and consequently is dangerous to use. The - * first part of the function returns a blkcg where a reference is owned by the - * bio. This means it does not need to be rcu protected as it cannot go away - * with the bio owning a reference to it. However, the latter potentially gets - * it from task_css(). This can race against task migration and the cgroup - * dying. It is also semantically different as it must be called rcu protected - * and is susceptible to failure when trying to get a reference to it. - * Therefore, it is not ok to assume that *_get() will always succeed on the - * blkcg returned here. - */ -static inline struct blkcg *__bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_blkg) - return bio->bi_blkg->blkcg; - return css_to_blkcg(blkcg_css()); -} - /** * bio_blkcg - grab the blkcg associated with a bio * @bio: target bio @@ -288,22 +152,6 @@ static inline bool blk_cgroup_congested(void) return ret; } -/** - * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg - * @return: true if this bio needs to be submitted with the root blkg context. - * - * In order to avoid priority inversions we sometimes need to issue a bio as if - * it were attached to the root blkg, and then backcharge to the actual owning - * blkg. The idea is we do bio_blkcg() to look up the actual context for the - * bio and attach the appropriate blkg to the bio. Then we call this helper and - * if it is true run with the root blkg for that queue and then do any - * backcharging to the originating cgroup once the io is complete. - */ -static inline bool bio_issue_as_root_blkg(struct bio *bio) -{ - return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; -} - /** * blkcg_parent - get the parent of a blkcg * @blkcg: blkcg of interest @@ -315,96 +163,6 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) return css_to_blkcg(blkcg->css.parent); } -/** - * __blkg_lookup - internal version of blkg_lookup() - * @blkcg: blkcg of interest - * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not - * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. - */ -static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, - struct request_queue *q, - bool update_hint) -{ - struct blkcg_gq *blkg; - - if (blkcg == &blkcg_root) - return q->root_blkg; - - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; - - return blkg_lookup_slowpath(blkcg, q, update_hint); -} - -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock. - */ -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, - struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return __blkg_lookup(blkcg, q, false); -} - -/** - * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair - * @q: request_queue of interest - * - * Lookup blkg for @q at the root level. See also blkg_lookup(). - */ -static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) -{ - return q->root_blkg; -} - -/** - * blkg_to_pdata - get policy private data - * @blkg: blkg of interest - * @pol: policy of interest - * - * Return pointer to private data associated with the @blkg-@pol pair. - */ -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) -{ - return blkg ? blkg->pd[pol->plid] : NULL; -} - -static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, - struct blkcg_policy *pol) -{ - return blkcg ? blkcg->cpd[pol->plid] : NULL; -} - -/** - * pdata_to_blkg - get blkg associated with policy private data - * @pd: policy private data of interest - * - * @pd is policy private data. Determine the blkg it's associated with. - */ -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) -{ - return pd ? pd->blkg : NULL; -} - -static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) -{ - return cpd ? cpd->blkcg : NULL; -} - -extern void blkcg_destroy_blkgs(struct blkcg *blkcg); - /** * blkcg_pin_online - pin online state * @blkcg: blkcg of interest @@ -437,231 +195,24 @@ static inline void blkcg_unpin_online(struct blkcg *blkcg) } while (blkcg); } -/** - * blkg_path - format cgroup path of blkg - * @blkg: blkg of interest - * @buf: target buffer - * @buflen: target buffer length - * - * Format the path of the cgroup of @blkg into @buf. - */ -static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) -{ - return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); -} - -/** - * blkg_get - get a blkg reference - * @blkg: blkg to get - * - * The caller should be holding an existing reference. - */ -static inline void blkg_get(struct blkcg_gq *blkg) -{ - percpu_ref_get(&blkg->refcnt); -} - -/** - * blkg_tryget - try and get a blkg reference - * @blkg: blkg to get - * - * This is for use when doing an RCU lookup of the blkg. We may be in the midst - * of freeing this blkg, so we can only use it if the refcnt is not zero. - */ -static inline bool blkg_tryget(struct blkcg_gq *blkg) -{ - return blkg && percpu_ref_tryget(&blkg->refcnt); -} - -/** - * blkg_put - put a blkg reference - * @blkg: blkg to put - */ -static inline void blkg_put(struct blkcg_gq *blkg) -{ - percpu_ref_put(&blkg->refcnt); -} - -/** - * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU - * read locked. If called under either blkcg or queue lock, the iteration - * is guaranteed to include all and only online blkgs. The caller may - * update @pos_css by calling css_rightmost_descendant() to skip subtree. - * @p_blkg is included in the iteration and the first node to be visited. - */ -#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blkg_for_each_descendant_post - post-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Similar to blkg_for_each_descendant_pre() but performs post-order - * traversal instead. Synchronization rules are the same. @p_blkg is - * included in the iteration and the last node to be visited. - */ -#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -bool __blkcg_punt_bio_submit(struct bio *bio); - -static inline bool blkcg_punt_bio_submit(struct bio *bio) -{ - if (bio->bi_opf & REQ_CGROUP_PUNT) - return __blkcg_punt_bio_submit(bio); - else - return false; -} - -static inline void blkcg_bio_issue_init(struct bio *bio) -{ - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); -} - -static inline void blkcg_use_delay(struct blkcg_gq *blkg) -{ - if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) - return; - if (atomic_add_return(1, &blkg->use_delay) == 1) - atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); -} - -static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) -{ - int old = atomic_read(&blkg->use_delay); - - if (WARN_ON_ONCE(old < 0)) - return 0; - if (old == 0) - return 0; - - /* - * We do this song and dance because we can race with somebody else - * adding or removing delay. If we just did an atomic_dec we'd end up - * negative and we'd already be in trouble. We need to subtract 1 and - * then check to see if we were the last delay so we can drop the - * congestion count on the cgroup. - */ - while (old) { - int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1); - if (cur == old) - break; - old = cur; - } - - if (old == 0) - return 0; - if (old == 1) - atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); - return 1; -} - -/** - * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount - * @blkg: target blkg - * @delay: delay duration in nsecs - * - * When enabled with this function, the delay is not decayed and must be - * explicitly cleared with blkcg_clear_delay(). Must not be mixed with - * blkcg_[un]use_delay() and blkcg_add_delay() usages. - */ -static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) -{ - int old = atomic_read(&blkg->use_delay); - - /* We only want 1 person setting the congestion count for this blkg. */ - if (!old && atomic_cmpxchg(&blkg->use_delay, old, -1) == old) - atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); - - atomic64_set(&blkg->delay_nsec, delay); -} - -/** - * blkcg_clear_delay - Disable allocator delay mechanism - * @blkg: target blkg - * - * Disable use_delay mechanism. See blkcg_set_delay(). - */ -static inline void blkcg_clear_delay(struct blkcg_gq *blkg) -{ - int old = atomic_read(&blkg->use_delay); - - /* We only want 1 person clearing the congestion count for this blkg. */ - if (old && atomic_cmpxchg(&blkg->use_delay, old, 0) == old) - atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); -} - -void blk_cgroup_bio_start(struct bio *bio); -void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); -void blkcg_maybe_throttle_current(void); #else /* CONFIG_BLK_CGROUP */ struct blkcg { }; -struct blkg_policy_data { -}; - -struct blkcg_policy_data { -}; - struct blkcg_gq { }; -struct blkcg_policy { -}; - #define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL)) static inline void blkcg_maybe_throttle_current(void) { } static inline bool blk_cgroup_congested(void) { return false; } #ifdef CONFIG_BLOCK - static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { } - -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) -{ return NULL; } -static inline int blkcg_init_queue(struct request_queue *q) { return 0; } -static inline void blkcg_exit_queue(struct request_queue *q) { } -static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } -static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } -static inline int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { return 0; } -static inline void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { } - -static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } +#endif /* CONFIG_BLOCK */ -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) { return NULL; } -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } -static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } -static inline void blkg_get(struct blkcg_gq *blkg) { } -static inline void blkg_put(struct blkcg_gq *blkg) { } - -static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } -static inline void blkcg_bio_issue_init(struct bio *bio) { } -static inline void blk_cgroup_bio_start(struct bio *bio) { } - -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) - -#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_BLK_CGROUP_FC_APPID -- cgit v1.2.3 From 28db4711bf48303814dcfd8d41a41106e90bc374 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 11:05:38 +0100 Subject: blk-mq: remove the request_queue argument to blk_insert_cloned_request The request must be submitted to the queue it was allocated for, so remove the extra request_queue argument. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220215100540.3892965-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d319ffa59354..3a41d50b85d3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -952,8 +952,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); void blk_rq_unprep_clone(struct request *rq); -blk_status_t blk_insert_cloned_request(struct request_queue *q, - struct request *rq); +blk_status_t blk_insert_cloned_request(struct request *rq); struct rq_map_data { struct page **pages; -- cgit v1.2.3 From 76792055c4c8b2472ca1ae48e0ddaf8497529f08 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 10:45:10 +0100 Subject: block: add a ->free_disk method Add a method to notify the driver that the gendisk is about to be freed. This allows drivers to tie the lifetime of their private data to that of the gendisk and thus deal with device removal races without expensive synchronization and boilerplate code. A new flag is added so that ->free_disk is only called after a successful call to add_disk, which significantly simplifies the error handling path during probing. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220215094514.3828912-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3bfc75a2a450..f757f9c2871f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -146,6 +146,7 @@ struct gendisk { #define GD_READ_ONLY 1 #define GD_DEAD 2 #define GD_NATIVE_CAPACITY 3 +#define GD_ADDED 4 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ @@ -1464,6 +1465,7 @@ struct block_device_operations { void (*unlock_native_capacity) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); int (*set_read_only)(struct block_device *bdev, bool ro); + void (*free_disk)(struct gendisk *disk); /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, -- cgit v1.2.3 From 20f01f163203666010ee1560852590a0c0572726 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 24 Jan 2022 13:59:38 -0800 Subject: blk-crypto: show crypto capabilities in sysfs Add sysfs files that expose the inline encryption capabilities of request queues: /sys/block/$disk/queue/crypto/max_dun_bits /sys/block/$disk/queue/crypto/modes/$mode /sys/block/$disk/queue/crypto/num_keyslots Userspace can use these new files to decide what encryption settings to use, or whether to use inline encryption at all. This also brings the crypto capabilities in line with the other queue properties, which are already discoverable via the queue directory in sysfs. Design notes: - Place the new files in a new subdirectory "crypto" to group them together and to avoid complicating the main "queue" directory. This also makes it possible to replace "crypto" with a symlink later if we ever make the blk_crypto_profiles into real kobjects (see below). - It was necessary to define a new kobject that corresponds to the crypto subdirectory. For now, this kobject just contains a pointer to the blk_crypto_profile. Note that multiple queues (and hence multiple such kobjects) may refer to the same blk_crypto_profile. An alternative design would more closely match the current kernel data structures: the blk_crypto_profile could be a kobject itself, located directly under the host controller device's kobject, while /sys/block/$disk/queue/crypto would be a symlink to it. I decided not to do that for now because it would require a lot more changes, such as no longer embedding blk_crypto_profile in other structures, and also because I'm not sure we can rule out moving the crypto capabilities into 'struct queue_limits' in the future. (Even if multiple queues share the same crypto engine, maybe the supported data unit sizes could differ due to other queue properties.) It would also still be possible to switch to that design later without breaking userspace, by replacing the directory with a symlink. - Use "max_dun_bits" instead of "max_dun_bytes". Currently, the kernel internally stores this value in bytes, but that's an implementation detail. It probably makes more sense to talk about this value in bits, and choosing bits is more future-proof. - "modes" is a sub-subdirectory, since there may be multiple supported crypto modes, sysfs is supposed to have one value per file, and it makes sense to group all the mode files together. - Each mode had to be named. The crypto API names like "xts(aes)" are not appropriate because they don't specify the key size. Therefore, I assigned new names. The exact names chosen are arbitrary, but they happen to match the names used in log messages in fs/crypto/. - The "num_keyslots" file is a bit different from the others in that it is only useful to know for performance reasons. However, it's included as it can still be useful. For example, a user might not want to use inline encryption if there aren't very many keyslots. Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20220124215938.2769-4-ebiggers@kernel.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f757f9c2871f..e19947d84f12 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -413,6 +413,7 @@ struct request_queue { #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct blk_crypto_profile *crypto_profile; + struct kobject *crypto_kobject; #endif unsigned int rq_timeout; -- cgit v1.2.3 From 97939610b893de068c82c347d06319cd231a4602 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 4 Mar 2022 19:01:05 +0100 Subject: block: remove bio_devname All callers are gone, so remove this wrapper. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20220304180105.409765-11-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 7523aba4ddf7..4c21f6e69e18 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -491,8 +491,6 @@ static inline void bio_release_pages(struct bio *bio, bool mark_dirty) __bio_release_pages(bio, mark_dirty); } -extern const char *bio_devname(struct bio *bio, char *buffer); - #define bio_dev(bio) \ disk_devt((bio)->bi_bdev->bd_disk) -- cgit v1.2.3 From 4e5cc99e1e485954a9c09872e0eeea570fb2b5a5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 15:32:19 +0800 Subject: blk-mq: manage hctx map via xarray First code becomes more clean by switching to xarray from plain array. Second use-after-free on q->queue_hw_ctx can be fixed because queue_for_each_hw_ctx() may be run when updating nr_hw_queues is in-progress. With this patch, q->hctx_table is defined as xarray, and this structure will share same lifetime with request queue, so queue_for_each_hw_ctx() can use q->hctx_table to lookup hctx reliably. Reported-by: Yu Kuai Signed-off-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220308073219.91173-7-ming.lei@redhat.com [axboe: fix blk_mq_hw_ctx forward declaration] Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 +-- include/linux/blkdev.h | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 3a41d50b85d3..7aa5c54901a9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -917,8 +917,7 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) } #define queue_for_each_hw_ctx(q, hctx, i) \ - for ((i) = 0; (i) < (q)->nr_hw_queues && \ - ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) + xa_for_each(&(q)->hctx_table, (i), (hctx)) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e19947d84f12..d982171469bc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -355,7 +355,7 @@ struct request_queue { unsigned int queue_depth; /* hw dispatch queues */ - struct blk_mq_hw_ctx **queue_hw_ctx; + struct xarray hctx_table; unsigned int nr_hw_queues; /* -- cgit v1.2.3 From e7f76552277ca79d9dfe4c59f7b3620a98467f9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Mar 2022 06:51:49 +0100 Subject: scsi: don't use disk->private_data to find the scsi_driver Requiring every ULP to have the scsi_drive as first member of the private data is rather fragile and not necessary anyway. Just use the driver hanging off the SCSI device instead. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220308055200.735835-4-hch@lst.de Signed-off-by: Jens Axboe --- include/scsi/scsi_cmnd.h | 9 --------- include/scsi/scsi_driver.h | 9 +++++++-- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 6794d7322cbd..e3a4c67794b1 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -13,7 +13,6 @@ #include struct Scsi_Host; -struct scsi_driver; /* * MAX_COMMAND_SIZE is: @@ -159,14 +158,6 @@ static inline void *scsi_cmd_priv(struct scsi_cmnd *cmd) return cmd + 1; } -/* make sure not to use it with passthrough commands */ -static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) -{ - struct request *rq = scsi_cmd_to_rq(cmd); - - return *(struct scsi_driver **)rq->q->disk->private_data; -} - void scsi_done(struct scsi_cmnd *cmd); extern void scsi_finish_command(struct scsi_cmnd *cmd); diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h index 6dffa8555a39..4ce1988b2ba0 100644 --- a/include/scsi/scsi_driver.h +++ b/include/scsi/scsi_driver.h @@ -4,11 +4,10 @@ #include #include +#include struct module; struct request; -struct scsi_cmnd; -struct scsi_device; struct scsi_driver { struct device_driver gendrv; @@ -31,4 +30,10 @@ extern int scsi_register_interface(struct class_interface *); #define scsi_unregister_interface(intf) \ class_interface_unregister(intf) +/* make sure not to use it with passthrough commands */ +static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) +{ + return to_scsi_driver(cmd->device->sdev_gendev.driver); +} + #endif /* _SCSI_SCSI_DRIVER_H */ -- cgit v1.2.3 From aa1b46dcdc7baaf5fec0be25782ef24b26aa209e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 13 Mar 2022 21:15:02 -1000 Subject: block: fix rq-qos breakage from skipping rq_qos_done_bio() a647a524a467 ("block: don't call rq_qos_ops->done_bio if the bio isn't tracked") made bio_endio() skip rq_qos_done_bio() if BIO_TRACKED is not set. While this fixed a potential oops, it also broke blk-iocost by skipping the done_bio callback for merged bios. Before, whether a bio goes through rq_qos_throttle() or rq_qos_merge(), rq_qos_done_bio() would be called on the bio on completion with BIO_TRACKED distinguishing the former from the latter. rq_qos_done_bio() is not called for bios which wenth through rq_qos_merge(). This royally confuses blk-iocost as the merged bios never finish and are considered perpetually in-flight. One reliably reproducible failure mode is an intermediate cgroup geting stuck active preventing its children from being activated due to the leaf-only rule, leading to loss of control. The following is from resctl-bench protection scenario which emulates isolating a web server like workload from a memory bomb run on an iocost configuration which should yield a reasonable level of protection. # cat /sys/block/nvme2n1/device/model Samsung SSD 970 PRO 512GB # cat /sys/fs/cgroup/io.cost.model 259:0 ctrl=user model=linear rbps=834913556 rseqiops=93622 rrandiops=102913 wbps=618985353 wseqiops=72325 wrandiops=71025 # cat /sys/fs/cgroup/io.cost.qos 259:0 enable=1 ctrl=user rpct=95.00 rlat=18776 wpct=95.00 wlat=8897 min=60.00 max=100.00 # resctl-bench -m 29.6G -r out.json run protection::scenario=mem-hog,loops=1 ... Memory Hog Summary ================== IO Latency: R p50=242u:336u/2.5m p90=794u:1.4m/7.5m p99=2.7m:8.0m/62.5m max=8.0m:36.4m/350m W p50=221u:323u/1.5m p90=709u:1.2m/5.5m p99=1.5m:2.5m/9.5m max=6.9m:35.9m/350m Isolation and Request Latency Impact Distributions: min p01 p05 p10 p25 p50 p75 p90 p95 p99 max mean stdev isol% 15.90 15.90 15.90 40.05 57.24 59.07 60.01 74.63 74.63 90.35 90.35 58.12 15.82 lat-imp% 0 0 0 0 0 4.55 14.68 15.54 233.5 548.1 548.1 53.88 143.6 Result: isol=58.12:15.82% lat_imp=53.88%:143.6 work_csv=100.0% missing=3.96% The isolation result of 58.12% is close to what this device would show without any IO control. Fix it by introducing a new flag BIO_QOS_MERGED to mark merged bios and calling rq_qos_done_bio() on them too. For consistency and clarity, rename BIO_TRACKED to BIO_QOS_THROTTLED. The flag checks are moved into rq_qos_done_bio() so that it's next to the code paths that set the flags. With the patch applied, the above same benchmark shows: # resctl-bench -m 29.6G -r out.json run protection::scenario=mem-hog,loops=1 ... Memory Hog Summary ================== IO Latency: R p50=123u:84.4u/985u p90=322u:256u/2.5m p99=1.6m:1.4m/9.5m max=11.1m:36.0m/350m W p50=429u:274u/995u p90=1.7m:1.3m/4.5m p99=3.4m:2.7m/11.5m max=7.9m:5.9m/26.5m Isolation and Request Latency Impact Distributions: min p01 p05 p10 p25 p50 p75 p90 p95 p99 max mean stdev isol% 84.91 84.91 89.51 90.73 92.31 94.49 96.36 98.04 98.71 100.0 100.0 94.42 2.81 lat-imp% 0 0 0 0 0 2.81 5.73 11.11 13.92 17.53 22.61 4.10 4.68 Result: isol=94.42:2.81% lat_imp=4.10%:4.68 work_csv=58.34% missing=0% Signed-off-by: Tejun Heo Fixes: a647a524a467 ("block: don't call rq_qos_ops->done_bio if the bio isn't tracked") Cc: stable@vger.kernel.org # v5.15+ Cc: Ming Lei Cc: Yu Kuai Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/Yi7rdrzQEHjJLGKB@slm.duckdns.org Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 5561e58d158a..0c3563b45fe9 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -324,7 +324,8 @@ enum { BIO_TRACE_COMPLETION, /* bio_endio() should trace the final completion * of this bio. */ BIO_CGROUP_ACCT, /* has been accounted to a cgroup */ - BIO_TRACKED, /* set if bio goes through the rq_qos path */ + BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */ + BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ BIO_PERCPU_CACHE, /* can participate in per-cpu alloc cache */ -- cgit v1.2.3