From 53a08807c01989c6847bb135d8d43f61c5dfdda5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Dec 2008 12:41:26 +0100 Subject: block: internal dequeue shouldn't start timer blkdev_dequeue_request() and elv_dequeue_request() are equivalent and both start the timeout timer. Barrier code dequeues the original barrier request but doesn't passes the request itself to lower level driver, only broken down proxy requests; however, as the original barrier code goes through the same dequeue path and timeout timer is started on it. If barrier sequence takes long enough, this timer expires but the low level driver has no idea about this request and oops follows. Timeout timer shouldn't have been started on the original barrier request as it never goes through actual IO. This patch unexports elv_dequeue_request(), which has no external user anyway, and makes it operate on elevator proper w/o adding the timer and make blkdev_dequeue_request() call elv_dequeue_request() and add timer. Internal users which don't pass the request to driver - barrier code and end_that_request_last() - are converted to use elv_dequeue_request(). Signed-off-by: Tejun Heo Cc: Mike Anderson Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a135256b272c..9cc7cc5fdce1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -786,6 +786,8 @@ static inline void blk_run_address_space(struct address_space *mapping) blk_run_backing_dev(mapping->backing_dev_info, NULL); } +extern void blkdev_dequeue_request(struct request *req); + /* * blk_end_request() and friends. * __blk_end_request() and end_request() must be called with @@ -820,11 +822,6 @@ extern void blk_update_request(struct request *rq, int error, extern unsigned int blk_rq_bytes(struct request *rq); extern unsigned int blk_rq_cur_bytes(struct request *rq); -static inline void blkdev_dequeue_request(struct request *req) -{ - elv_dequeue_request(req->q, req); -} - /* * Access functions for manipulating queue properties */ -- cgit v1.2.3 From 0e435ac26e3f951d83338ed3d4ab7dc0fe0055bc Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 3 Dec 2008 12:55:08 +0100 Subject: block: fix setting of max_segment_size and seg_boundary mask Fix setting of max_segment_size and seg_boundary mask for stacked md/dm devices. When stacking devices (LVM over MD over SCSI) some of the request queue parameters are not set up correctly in some cases by default, namely max_segment_size and and seg_boundary mask. If you create MD device over SCSI, these attributes are zeroed. Problem become when there is over this mapping next device-mapper mapping - queue attributes are set in DM this way: request_queue max_segment_size seg_boundary_mask SCSI 65536 0xffffffff MD RAID1 0 0 LVM 65536 -1 (64bit) Unfortunately bio_add_page (resp. bio_phys_segments) calculates number of physical segments according to these parameters. During the generic_make_request() is segment cout recalculated and can increase bio->bi_phys_segments count over the allowed limit. (After bio_clone() in stack operation.) Thi is specially problem in CCISS driver, where it produce OOPS here BUG_ON(creq->nr_phys_segments > MAXSGENTRIES); (MAXSEGENTRIES is 31 by default.) Sometimes even this command is enough to cause oops: dd iflag=direct if=/dev// of=/dev/null bs=128000 count=10 This command generates bios with 250 sectors, allocated in 32 4k-pages (last page uses only 1024 bytes). For LVM layer, it allocates bio with 31 segments (still OK for CCISS), unfortunatelly on lower layer it is recalculated to 32 segments and this violates CCISS restriction and triggers BUG_ON(). The patch tries to fix it by: * initializing attributes above in queue request constructor blk_queue_make_request() * make sure that blk_queue_stack_limits() inherits setting (DM uses its own function to set the limits because it blk_queue_stack_limits() was introduced later. It should probably switch to use generic stack limit function too.) * sets the default seg_boundary value in one place (blkdev.h) * use this mask as default in DM (instead of -1, which differs in 64bit) Bugs related to this: https://bugzilla.redhat.com/show_bug.cgi?id=471639 http://bugzilla.kernel.org/show_bug.cgi?id=8672 Signed-off-by: Milan Broz Reviewed-by: Alasdair G Kergon Cc: Neil Brown Cc: FUJITA Tomonori Cc: Tejun Heo Cc: Mike Miller Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9cc7cc5fdce1..6dcd30d806cd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -918,6 +918,8 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter); #define MAX_SEGMENT_SIZE 65536 +#define BLK_SEG_BOUNDARY_MASK 0xFFFFFFFFUL + #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) static inline int queue_hardsect_size(struct request_queue *q) -- cgit v1.2.3 From f2f1fa78a155524b849edf359e42a3001ea652c0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 5 Dec 2008 14:49:18 -0800 Subject: Enforce a minimum SG_IO timeout There's no point in having too short SG_IO timeouts, since if the command does end up timing out, we'll end up through the reset sequence that is several seconds long in order to abort the command that timed out. As a result, shorter timeouts than a few seconds simply do not make sense, as the recovery would be longer than the timeout itself. Add a BLK_MIN_SG_TIMEOUT to match the existign BLK_DEFAULT_SG_TIMEOUT. Suggested-by: Alan Cox Acked-by: Tejun Heo Acked-by: Jens Axboe Cc: Jeff Garzik Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6dcd30d806cd..031a315c0509 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -662,6 +662,7 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn; * default timeout for SG_IO if none specified */ #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) +#define BLK_MIN_SG_TIMEOUT (7 * HZ) #ifdef CONFIG_BOUNCE extern int init_emergency_isa_pool(void); -- cgit v1.2.3 From 88e740f1654bf28565edd528a060695c1f2b5ad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=C3=A1zquez=20Cao?= Date: Mon, 27 Oct 2008 18:44:46 +0900 Subject: block: add queue flag for paravirt frontend drivers As is the case with SSD devices, we do not want to idle in AS/CFQ when the block device is a paravirt front-end driver. This patch adds a flag (QUEUE_FLAG_VIRT) which should be used by front-end drivers such as virtio_blk and xen-blkfront to indicate a paravirtualized device. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 031a315c0509..482e9600f7a2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -449,6 +449,7 @@ struct request_queue #define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */ #define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */ #define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */ +#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ static inline int queue_is_locked(struct request_queue *q) { -- cgit v1.2.3 From 64d01dc9e1927e6535627d73f2336c75d1dd3fe2 Mon Sep 17 00:00:00 2001 From: Cheng Renquan Date: Wed, 3 Dec 2008 12:41:39 +0100 Subject: block: use cancel_work_sync() instead of kblockd_flush_work() After many improvements on kblockd_flush_work, it is now identical to cancel_work_sync, so a direct call to cancel_work_sync is suggested. The only difference is that cancel_work_sync is a GPL symbol, so no non-GPL modules anymore. Signed-off-by: Cheng Renquan Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 482e9600f7a2..e9bb73ff1d64 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -978,7 +978,6 @@ static inline void put_dev_sector(Sector p) struct work_struct; int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); -void kblockd_flush_work(struct work_struct *work); #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) -- cgit v1.2.3 From 313e42999dbc0f234ca5909a236f78f082cb43b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:02 +0900 Subject: block: reorganize QUEUE_ORDERED_* constants Separate out ordering type (drain,) and action masks (preflush, postflush, fua) from visible ordering mode selectors (QUEUE_ORDERED_*). Ordering types are now named QUEUE_ORDERED_BY_* while action masks are named QUEUE_ORDERED_DO_*. This change is necessary to add QUEUE_ORDERED_DO_BAR and make it optional to improve empty barrier implementation. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e9bb73ff1d64..5c92b4432399 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -523,22 +523,29 @@ enum { * TAG_FLUSH : ordering by tag w/ pre and post flushes * TAG_FUA : ordering by tag w/ pre flush and FUA write */ - QUEUE_ORDERED_NONE = 0x00, - QUEUE_ORDERED_DRAIN = 0x01, - QUEUE_ORDERED_TAG = 0x02, - - QUEUE_ORDERED_PREFLUSH = 0x10, - QUEUE_ORDERED_POSTFLUSH = 0x20, - QUEUE_ORDERED_FUA = 0x40, - - QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, - QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, - QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, - QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, + QUEUE_ORDERED_BY_DRAIN = 0x01, + QUEUE_ORDERED_BY_TAG = 0x02, + QUEUE_ORDERED_DO_PREFLUSH = 0x10, + QUEUE_ORDERED_DO_POSTFLUSH = 0x40, + QUEUE_ORDERED_DO_FUA = 0x80, + + QUEUE_ORDERED_NONE = 0x00, + + QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN, + QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_POSTFLUSH, + QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_FUA, + + QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG, + QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_POSTFLUSH, + QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_FUA, /* * Ordered operation sequence -- cgit v1.2.3 From f671620e7d895af221bdfeda751d54fa55ed9546 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:04 +0900 Subject: block: make every barrier action optional In all barrier sequences, the barrier write itself was always assumed to be issued and thus didn't have corresponding control flag. This patch adds QUEUE_ORDERED_DO_BAR and unify action mask handling in start_ordered() such that any barrier action can be skipped. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c92b4432399..b044267009ed 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -526,12 +526,14 @@ enum { QUEUE_ORDERED_BY_DRAIN = 0x01, QUEUE_ORDERED_BY_TAG = 0x02, QUEUE_ORDERED_DO_PREFLUSH = 0x10, + QUEUE_ORDERED_DO_BAR = 0x20, QUEUE_ORDERED_DO_POSTFLUSH = 0x40, QUEUE_ORDERED_DO_FUA = 0x80, QUEUE_ORDERED_NONE = 0x00, - QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN, + QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN | + QUEUE_ORDERED_DO_BAR, QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | QUEUE_ORDERED_DO_PREFLUSH | QUEUE_ORDERED_DO_POSTFLUSH, @@ -539,7 +541,8 @@ enum { QUEUE_ORDERED_DO_PREFLUSH | QUEUE_ORDERED_DO_FUA, - QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG, + QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG | + QUEUE_ORDERED_DO_BAR, QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | QUEUE_ORDERED_DO_PREFLUSH | QUEUE_ORDERED_DO_POSTFLUSH, -- cgit v1.2.3 From 8f11b3e99a1136fcbb67316c3260f085299c0bff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:05 +0900 Subject: block: make barrier completion more robust Barrier completion had the following assumptions. * start_ordered() couldn't finish the whole sequence properly. If all actions are to be skipped, q->ordseq is set correctly but the actual completion was never triggered thus hanging the barrier request. * Drain completion in elv_complete_request() assumed that there's always at least one request in the queue when drain completes. Both assumptions are true but these assumptions need to be removed to improve empty barrier implementation. This patch makes the following changes. * Make start_ordered() use blk_ordered_complete_seq() to mark skipped steps complete and notify __elv_next_request() that it should fetch the next request if the whole barrier has completed inside start_ordered(). * Make drain completion path in elv_complete_request() check whether the queue is empty. Empty queue also indicates drain completion. * While at it, convert 0/1 return from blk_do_ordered() to false/true. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b044267009ed..3c7078e0129d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -866,10 +866,10 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); -extern int blk_do_ordered(struct request_queue *, struct request **); +extern bool blk_do_ordered(struct request_queue *, struct request **); extern unsigned blk_ordered_cur_seq(struct request_queue *); extern unsigned blk_ordered_req_seq(struct request *); -extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int); +extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); -- cgit v1.2.3 From 58eea927d2de43dc6f03d1ba2c46e55854b31540 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:06 +0900 Subject: block: simplify empty barrier implementation Empty barrier required special handling in __elv_next_request() to complete it without letting the low level driver see it. With previous changes, barrier code is now flexible enough to skip the BAR step using the same barrier sequence selection mechanism. Drop the special handling and mask off q->ordered from start_ordered(). Remove blk_empty_barrier() test which now has no user. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3c7078e0129d..41bbadfd17f6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -596,7 +596,6 @@ enum { #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA) #define blk_discard_rq(rq) ((rq)->cmd_flags & REQ_DISCARD) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) -#define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors) /* rq->queuelist of dequeued request must be list_empty() */ #define blk_queued_rq(rq) (!list_empty(&(rq)->queuelist)) -- cgit v1.2.3 From b374d18a4bfce705e4a99ae9f501b53e86ecb283 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 31 Oct 2008 10:05:07 +0100 Subject: block: get rid of elevator_t typedef Just use struct elevator_queue everywhere instead. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/blkdev.h') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 41bbadfd17f6..7035cec583b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -26,7 +26,6 @@ struct scsi_ioctl_command; struct request_queue; struct elevator_queue; -typedef struct elevator_queue elevator_t; struct request_pm_state; struct blk_trace; struct request; @@ -313,7 +312,7 @@ struct request_queue */ struct list_head queue_head; struct request *last_merge; - elevator_t *elevator; + struct elevator_queue *elevator; /* * the queue request freelist, one for reads and one for writes -- cgit v1.2.3