From b54e5ed8f285d62c0d242c4ef9da90937994db02 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2015 11:16:44 +0800 Subject: block: partition: introduce hd_free_part() So the helper can be used in both generic partition case and part0 case. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/genhd.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ec274e0f4ed2..a221220ffcb2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -663,6 +663,12 @@ static inline void hd_struct_put(struct hd_struct *part) __delete_partition(part); } +static inline void hd_free_part(struct hd_struct *part) +{ + free_part_stats(part); + free_part_info(part); +} + /* * Any access of part->nr_sects which is not protected by partition * bd_mutex or gendisk bdev bd_mutex, should be done using this -- cgit v1.2.3 From 6c71013ecb7e2bddbed9f5b95e7aed22c491daa9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2015 11:16:45 +0800 Subject: block: partition: convert percpu ref Percpu refcount is the perfect match for partition's case, and the conversion is quite straight. With the convertion, one pair of atomic inc/dec can be saved for accounting block I/O, which is run in hot path of block I/O. Signed-off-by: Ming Lei Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index a221220ffcb2..2adbfa6d02bc 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_BLOCK @@ -124,7 +125,7 @@ struct hd_struct { #else struct disk_stats dkstats; #endif - atomic_t ref; + struct percpu_ref ref; struct rcu_head rcu_head; }; @@ -611,7 +612,7 @@ extern struct hd_struct * __must_check add_partition(struct gendisk *disk, sector_t len, int flags, struct partition_meta_info *info); -extern void __delete_partition(struct hd_struct *); +extern void __delete_partition(struct percpu_ref *); extern void delete_partition(struct gendisk *, int); extern void printk_all_partitions(void); @@ -640,33 +641,39 @@ extern ssize_t part_fail_store(struct device *dev, const char *buf, size_t count); #endif /* CONFIG_FAIL_MAKE_REQUEST */ -static inline void hd_ref_init(struct hd_struct *part) +static inline int hd_ref_init(struct hd_struct *part) { - atomic_set(&part->ref, 1); - smp_mb(); + if (percpu_ref_init(&part->ref, __delete_partition, 0, + GFP_KERNEL)) + return -ENOMEM; + return 0; } static inline void hd_struct_get(struct hd_struct *part) { - atomic_inc(&part->ref); - smp_mb__after_atomic(); + percpu_ref_get(&part->ref); } static inline int hd_struct_try_get(struct hd_struct *part) { - return atomic_inc_not_zero(&part->ref); + return percpu_ref_tryget_live(&part->ref); } static inline void hd_struct_put(struct hd_struct *part) { - if (atomic_dec_and_test(&part->ref)) - __delete_partition(part); + percpu_ref_put(&part->ref); +} + +static inline void hd_struct_kill(struct hd_struct *part) +{ + percpu_ref_kill(&part->ref); } static inline void hd_free_part(struct hd_struct *part) { free_part_stats(part); free_part_info(part); + percpu_ref_exit(&part->ref); } /* -- cgit v1.2.3 From 0034af036554c39eefd14d835a8ec3496ac46712 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 16 Jul 2015 09:14:26 -0600 Subject: block: make /sys/block//queue/discard_max_bytes writeable Lots of devices support huge discard sizes these days. Depending on how the device handles them internally, huge discards can introduce massive latencies (hundreds of msec) on the device side. We have a sysfs file, discard_max_bytes, that advertises the max hardware supported discard size. Make this writeable, and split the settings into a soft and hard limit. This can be set from 'discard_granularity' and up to the hardware limit. Add a new sysfs file, 'discard_max_hw_bytes', that shows the hw set limit. Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d4068c17d0df..243f29e779ec 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -268,6 +268,7 @@ struct queue_limits { unsigned int io_min; unsigned int io_opt; unsigned int max_discard_sectors; + unsigned int max_hw_discard_sectors; unsigned int max_write_same_sectors; unsigned int discard_granularity; unsigned int discard_alignment; -- cgit v1.2.3 From 4246a0b63bd8f56a1469b12eafeb875b1041a451 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 20 Jul 2015 15:29:37 +0200 Subject: block: add a bi_error field to struct bio Currently we have two different ways to signal an I/O error on a BIO: (1) by clearing the BIO_UPTODATE flag (2) by returning a Linux errno value to the bi_end_io callback The first one has the drawback of only communicating a single possible error (-EIO), and the second one has the drawback of not beeing persistent when bios are queued up, and are not passed along from child to parent bio in the ever more popular chaining scenario. Having both mechanisms available has the additional drawback of utterly confusing driver authors and introducing bugs where various I/O submitters only deal with one of them, and the others have to add boilerplate code to deal with both kinds of error returns. So add a new bi_error field to store an errno value directly in struct bio and remove the existing mechanisms to clean all this up. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: NeilBrown Signed-off-by: Jens Axboe --- include/linux/bio.h | 13 +++++++++---- include/linux/blk_types.h | 4 ++-- include/linux/swap.h | 4 ++-- 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 5e963a6d7c14..6b918177002d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -195,8 +195,6 @@ static inline bool bvec_gap_to_prev(struct bio_vec *bprv, unsigned int offset) return offset || ((bprv->bv_offset + bprv->bv_len) & (PAGE_SIZE - 1)); } -#define bio_io_error(bio) bio_endio((bio), -EIO) - /* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it @@ -426,7 +424,14 @@ static inline struct bio *bio_clone_kmalloc(struct bio *bio, gfp_t gfp_mask) } -extern void bio_endio(struct bio *, int); +extern void bio_endio(struct bio *); + +static inline void bio_io_error(struct bio *bio) +{ + bio->bi_error = -EIO; + bio_endio(bio); +} + struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); @@ -717,7 +722,7 @@ extern void bio_integrity_free(struct bio *); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); extern bool bio_integrity_enabled(struct bio *bio); extern int bio_integrity_prep(struct bio *); -extern void bio_integrity_endio(struct bio *, int); +extern void bio_integrity_endio(struct bio *); extern void bio_integrity_advance(struct bio *, unsigned int); extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 7303b3405520..6164fb8a817b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -14,7 +14,7 @@ struct page; struct block_device; struct io_context; struct cgroup_subsys_state; -typedef void (bio_end_io_t) (struct bio *, int); +typedef void (bio_end_io_t) (struct bio *); typedef void (bio_destructor_t) (struct bio *); /* @@ -53,6 +53,7 @@ struct bio { struct bvec_iter bi_iter; + int bi_error; /* Number of segments in this BIO after * physical address coalescing is performed. */ @@ -111,7 +112,6 @@ struct bio { /* * bio flags */ -#define BIO_UPTODATE 0 /* ok after I/O completion */ #define BIO_SEG_VALID 1 /* bi_phys_segments valid */ #define BIO_CLONED 2 /* doesn't own data */ #define BIO_BOUNCED 3 /* bio is a bounce bio */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 38874729dc5f..31496d201fdc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -373,9 +373,9 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) /* linux/mm/page_io.c */ extern int swap_readpage(struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); -extern void end_swap_bio_write(struct bio *bio, int err); +extern void end_swap_bio_write(struct bio *bio); extern int __swap_writepage(struct page *page, struct writeback_control *wbc, - void (*end_write_func)(struct bio *, int)); + bio_end_io_t end_write_func); extern int swap_set_page_dirty(struct page *page); int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, -- cgit v1.2.3 From b7c44ed9d2fc6b461378c65eaf144ccc80a47772 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Jul 2015 12:37:59 -0600 Subject: block: manipulate bio->bi_flags through helpers Some places use helpers now, others don't. We only have the 'is set' helper, add helpers for setting and clearing flags too. It was a bit of a mess of atomic vs non-atomic access. With BIO_UPTODATE gone, we don't have any risk of concurrent access to the flags. So relax the restriction and don't make any of them atomic. The flags that do have serialization issues (reffed and chained), we already handle those separately. Signed-off-by: Jens Axboe --- include/linux/bio.h | 15 +++++++++++++++ include/linux/blk_types.h | 2 -- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 6b918177002d..986e6e19feb5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -304,6 +304,21 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count) atomic_set(&bio->__bi_cnt, count); } +static inline bool bio_flagged(struct bio *bio, unsigned int bit) +{ + return (bio->bi_flags & (1UL << bit)) != 0; +} + +static inline void bio_set_flag(struct bio *bio, unsigned int bit) +{ + bio->bi_flags |= (1UL << bit); +} + +static inline void bio_clear_flag(struct bio *bio, unsigned int bit) +{ + bio->bi_flags &= ~(1UL << bit); +} + enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 6164fb8a817b..a765a50e780f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -129,8 +129,6 @@ struct bio { #define BIO_RESET_BITS 13 #define BIO_OWNS_VEC 13 /* bio_free() should free bvec */ -#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) - /* * top 4 bits of bio flags indicate the pool this bio came from */ -- cgit v1.2.3 From 2c68f6dc6e621153a708bef6c569805762da2020 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 28 Jul 2015 13:14:32 -0600 Subject: block: shrink struct bio down to 2 cache lines again Commit bcf2843b3f8f added ->bi_error to cleanup the error passing for struct bio, but that ended up adding 4 bytes and a 4 byte hole to the size of struct bio. For a clean config, that bumped it from 128 bytes, to 136 bytes, on x86-64. The ->bi_flags member is currently an unsigned long, but it fits easily within an int. Change it to an unsigned int, adjust the the pool offset code, and move ->bi_error into the new hole. Then we end up with a 128 byte bio again. Change the bio flag set/clear to use cmpxchg to ensure we don't lose any flags when manipulating them. Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 +++--- include/linux/blk_types.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 986e6e19feb5..b7892a1906bd 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -306,17 +306,17 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count) static inline bool bio_flagged(struct bio *bio, unsigned int bit) { - return (bio->bi_flags & (1UL << bit)) != 0; + return (bio->bi_flags & (1U << bit)) != 0; } static inline void bio_set_flag(struct bio *bio, unsigned int bit) { - bio->bi_flags |= (1UL << bit); + bio->bi_flags |= (1U << bit); } static inline void bio_clear_flag(struct bio *bio, unsigned int bit) { - bio->bi_flags &= ~(1UL << bit); + bio->bi_flags &= ~(1U << bit); } enum bip_flags { diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a765a50e780f..4b7b4ebaa633 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -46,14 +46,14 @@ struct bvec_iter { struct bio { struct bio *bi_next; /* request queue link */ struct block_device *bi_bdev; - unsigned long bi_flags; /* status, command, etc */ + unsigned int bi_flags; /* status, command, etc */ + int bi_error; unsigned long bi_rw; /* bottom bits READ/WRITE, * top bits priority */ struct bvec_iter bi_iter; - int bi_error; /* Number of segments in this BIO after * physical address coalescing is performed. */ @@ -134,7 +134,7 @@ struct bio { */ #define BIO_POOL_BITS (4) #define BIO_POOL_NONE ((1UL << BIO_POOL_BITS) - 1) -#define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS) +#define BIO_POOL_OFFSET (32 - BIO_POOL_BITS) #define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET) #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET) -- cgit v1.2.3 From 41609892701e26724b8617201f43254cadf2e7ae Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 12 Aug 2015 15:59:45 +0530 Subject: blk-cgroup: Drop unlikely before IS_ERR(_OR_NULL) IS_ERR(_OR_NULL) already contain an 'unlikely' compiler flag and there is no need to do that again from its callers. Drop it. Acked-by: Tejun Heo Signed-off-by: Viresh Kumar Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 1b62d768c7df..a4cd1641e9e2 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -374,7 +374,7 @@ static inline struct request_list *blk_get_rl(struct request_queue *q, * root_rl in such cases. */ blkg = blkg_lookup_create(blkcg, q); - if (unlikely(IS_ERR(blkg))) + if (IS_ERR(blkg)) goto root_rl; blkg_get(blkg); -- cgit v1.2.3 From 54efd50bfd873e2dbf784e0b21a8027ba4299a3e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Apr 2015 22:37:18 -0700 Subject: block: make generic_make_request handle arbitrarily sized bios The way the block layer is currently written, it goes to great lengths to avoid having to split bios; upper layer code (such as bio_add_page()) checks what the underlying device can handle and tries to always create bios that don't need to be split. But this approach becomes unwieldy and eventually breaks down with stacked devices and devices with dynamic limits, and it adds a lot of complexity. If the block layer could split bios as needed, we could eliminate a lot of complexity elsewhere - particularly in stacked drivers. Code that creates bios can then create whatever size bios are convenient, and more importantly stacked drivers don't have to deal with both their own bio size limitations and the limitations of the (potentially multiple) devices underneath them. In the future this will let us delete merge_bvec_fn and a bunch of other code. We do this by adding calls to blk_queue_split() to the various make_request functions that need it - a few can already handle arbitrary size bios. Note that we add the call _after_ any call to blk_queue_bounce(); this means that blk_queue_split() and blk_recalc_rq_segments() don't need to be concerned with bouncing affecting segment merging. Some make_request_fn() callbacks were simple enough to audit and verify they don't need blk_queue_split() calls. The skipped ones are: * nfhd_make_request (arch/m68k/emu/nfblock.c) * axon_ram_make_request (arch/powerpc/sysdev/axonram.c) * simdisk_make_request (arch/xtensa/platforms/iss/simdisk.c) * brd_make_request (ramdisk - drivers/block/brd.c) * mtip_submit_request (drivers/block/mtip32xx/mtip32xx.c) * loop_make_request * null_queue_bio * bcache's make_request fns Some others are almost certainly safe to remove now, but will be left for future patches. Cc: Jens Axboe Cc: Christoph Hellwig Cc: Al Viro Cc: Ming Lei Cc: Neil Brown Cc: Alasdair Kergon Cc: Mike Snitzer Cc: dm-devel@redhat.com Cc: Lars Ellenberg Cc: drbd-user@lists.linbit.com Cc: Jiri Kosina Cc: Geoff Levand Cc: Jim Paris Cc: Philip Kelleher Cc: Minchan Kim Cc: Nitin Gupta Cc: Oleg Drokin Cc: Andreas Dilger Acked-by: NeilBrown (for the 'md/md.c' bits) Acked-by: Mike Snitzer Reviewed-by: Martin K. Petersen Signed-off-by: Kent Overstreet [dpark: skip more mq-based drivers, resolve merge conflicts, etc.] Signed-off-by: Dongsu Park Signed-off-by: Ming Lin Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 243f29e779ec..ca778d9c7d81 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -463,6 +463,7 @@ struct request_queue { struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; + struct bio_set *bio_split; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ @@ -783,6 +784,8 @@ extern void blk_rq_unprep_clone(struct request *rq); extern int blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern void blk_delay_queue(struct request_queue *, unsigned long); +extern void blk_queue_split(struct request_queue *, struct bio **, + struct bio_set *); extern void blk_recount_segments(struct request_queue *, struct bio *); extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t, -- cgit v1.2.3 From 8ae126660fddbeebb9251a174e6fa45b6ad8f932 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 27 Apr 2015 23:48:34 -0700 Subject: block: kill merge_bvec_fn() completely As generic_make_request() is now able to handle arbitrarily sized bios, it's no longer necessary for each individual block driver to define its own ->merge_bvec_fn() callback. Remove every invocation completely. Cc: Jens Axboe Cc: Lars Ellenberg Cc: drbd-user@lists.linbit.com Cc: Jiri Kosina Cc: Yehuda Sadeh Cc: Sage Weil Cc: Alex Elder Cc: ceph-devel@vger.kernel.org Cc: Alasdair Kergon Cc: Mike Snitzer Cc: dm-devel@redhat.com Cc: Neil Brown Cc: linux-raid@vger.kernel.org Cc: Christoph Hellwig Cc: "Martin K. Petersen" Acked-by: NeilBrown (for the 'md' bits) Acked-by: Mike Snitzer Signed-off-by: Kent Overstreet [dpark: also remove ->merge_bvec_fn() in dm-thin as well as dm-era-target, and resolve merge conflicts] Signed-off-by: Dongsu Park Signed-off-by: Ming Lin Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 10 ---------- include/linux/device-mapper.h | 4 ---- 2 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ca778d9c7d81..a1feff54aeab 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -213,14 +213,6 @@ typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unprep_rq_fn) (struct request_queue *, struct request *); struct bio_vec; -struct bvec_merge_data { - struct block_device *bi_bdev; - sector_t bi_sector; - unsigned bi_size; - unsigned long bi_rw; -}; -typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *, - struct bio_vec *); typedef void (softirq_done_fn)(struct request *); typedef int (dma_drain_needed_fn)(struct request *); typedef int (lld_busy_fn) (struct request_queue *q); @@ -306,7 +298,6 @@ struct request_queue { make_request_fn *make_request_fn; prep_rq_fn *prep_rq_fn; unprep_rq_fn *unprep_rq_fn; - merge_bvec_fn *merge_bvec_fn; softirq_done_fn *softirq_done_fn; rq_timed_out_fn *rq_timed_out_fn; dma_drain_needed_fn *dma_drain_needed; @@ -992,7 +983,6 @@ extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn); extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn); -extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 51cc1deb7af3..76d23fa8c7d3 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -82,9 +82,6 @@ typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv); typedef int (*dm_ioctl_fn) (struct dm_target *ti, unsigned int cmd, unsigned long arg); -typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size); - /* * These iteration functions are typically used to check (and combine) * properties of underlying devices. @@ -160,7 +157,6 @@ struct target_type { dm_status_fn status; dm_message_fn message; dm_ioctl_fn ioctl; - dm_merge_fn merge; dm_busy_fn busy; dm_iterate_devices_fn iterate_devices; dm_io_hints_fn io_hints; -- cgit v1.2.3 From b54ffb73cadcdcff9cc1ae0e11f502407e3e2e4c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 19 May 2015 14:31:01 +0200 Subject: block: remove bio_get_nr_vecs() We can always fill up the bio now, no need to estimate the possible size based on queue parameters. Acked-by: Steven Whitehouse Signed-off-by: Kent Overstreet [hch: rebased and wrote a changelog] Signed-off-by: Christoph Hellwig Signed-off-by: Ming Lin Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index b7892a1906bd..ad7217458812 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -460,7 +460,6 @@ void bio_chain(struct bio *, struct bio *); extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); -extern int bio_get_nr_vecs(struct block_device *); struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, const struct iov_iter *, gfp_t); -- cgit v1.2.3 From 30e2bc08b2bb7c069246feee78f7ed4006e130fe Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Thu, 13 Aug 2015 14:57:56 -0400 Subject: Revert "block: remove artifical max_hw_sectors cap" This reverts commit 34b48db66e08ca1c1bc07cf305d672ac940268dc. That commit caused performance regressions for streaming I/O workloads on a number of different storage devices, from SATA disks to external RAID arrays. It also managed to trip up some buggy firmware in at least one drive, causing data corruption. The next patch will bump the default max_sectors_kb value to 1280, which will accommodate a 10-data-disk stripe write with chunk size 128k. In the testing I've done using iozone, fio, and aio-stress, a value of 1280 does not show a big performance difference from 512. This will hopefully still help the software RAID setup that Christoph saw the original performance gains with while still not regressing other storage configurations. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a1feff54aeab..9f1e3f8eeed7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1132,6 +1132,7 @@ extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); enum blk_default_limits { BLK_MAX_SEGMENTS = 128, BLK_SAFE_MAX_SECTORS = 255, + BLK_DEF_MAX_SECTORS = 1024, BLK_MAX_SEGMENT_SIZE = 65536, BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; -- cgit v1.2.3 From d2be537c3ba3568acd79cd178327b842e60d035e Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Thu, 13 Aug 2015 14:57:57 -0400 Subject: block: bump BLK_DEF_MAX_SECTORS to 2560 A value of 2560 (1280k) will accommodate a 10-data-disk stripe write with chunk size 128k. In the testing I've done using iozone, fio, and aio-stress across a number of different storage devices, a value of 1280 does not show a big performance difference from 512, but will hopefully help software RAID setups using SATA disks, as reported by Christoph. NOTE: drivers/block/aoe/aoeblk.c sets its own max_hw_sectors_kb to BLK_DEF_MAX_SECTORS. So, this patch essentially changes aeoblk to Use a larger maximum sector size, and I did not test this. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9f1e3f8eeed7..e427debc7008 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1132,7 +1132,7 @@ extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); enum blk_default_limits { BLK_MAX_SEGMENTS = 128, BLK_SAFE_MAX_SECTORS = 255, - BLK_DEF_MAX_SECTORS = 1024, + BLK_DEF_MAX_SECTORS = 2560, BLK_MAX_SEGMENT_SIZE = 65536, BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; -- cgit v1.2.3 From 03100aada96f0645bbcb89aea24c01f02d0ef1fa Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 19 Aug 2015 14:24:05 -0700 Subject: block: Replace SG_GAPS with new queue limits mask The SG_GAPS queue flag caused checks for bio vector alignment against PAGE_SIZE, but the device may have different constraints. This patch adds a queue limits so a driver with such constraints can set to allow requests that would have been unnecessarily split. The new gaps check takes the request_queue as a parameter to simplify the logic around invoking this function. This new limit makes the queue flag redundant, so removing it and all usage. Device-mappers will inherit the correct settings through blk_stack_limits(). Signed-off-by: Keith Busch Reviewed-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 9 --------- include/linux/blkdev.h | 21 ++++++++++++++++++++- 2 files changed, 20 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index ad7217458812..b9b6e046b52e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -186,15 +186,6 @@ static inline void *bio_data(struct bio *bio) #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q))) -/* - * Check if adding a bio_vec after bprv with offset would create a gap in - * the SG list. Most drivers don't care about this, but some do. - */ -static inline bool bvec_gap_to_prev(struct bio_vec *bprv, unsigned int offset) -{ - return offset || ((bprv->bv_offset + bprv->bv_len) & (PAGE_SIZE - 1)); -} - /* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e427debc7008..a622f270f09e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -250,6 +250,7 @@ struct blk_queue_tag { struct queue_limits { unsigned long bounce_pfn; unsigned long seg_boundary_mask; + unsigned long virt_boundary_mask; unsigned int max_hw_sectors; unsigned int chunk_sectors; @@ -479,7 +480,6 @@ struct request_queue { #define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ #define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/ -#define QUEUE_FLAG_SG_GAPS 22 /* queue doesn't support SG gaps */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -981,6 +981,7 @@ extern int blk_queue_dma_drain(struct request_queue *q, void *buf, unsigned int size); extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); +extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn); extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn); extern void blk_queue_dma_alignment(struct request_queue *, int); @@ -1149,6 +1150,11 @@ static inline unsigned long queue_segment_boundary(struct request_queue *q) return q->limits.seg_boundary_mask; } +static inline unsigned long queue_virt_boundary(struct request_queue *q) +{ + return q->limits.virt_boundary_mask; +} + static inline unsigned int queue_max_sectors(struct request_queue *q) { return q->limits.max_sectors; @@ -1349,6 +1355,19 @@ static inline void put_dev_sector(Sector p) page_cache_release(p.v); } +/* + * Check if adding a bio_vec after bprv with offset would create a gap in + * the SG list. Most drivers don't care about this, but some do. + */ +static inline bool bvec_gap_to_prev(struct request_queue *q, + struct bio_vec *bprv, unsigned int offset) +{ + if (!queue_virt_boundary(q)) + return false; + return offset || + ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); +} + struct work_struct; int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); -- cgit v1.2.3