diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 22 | ||||
-rw-r--r-- | block/Kconfig.iosched | 16 | ||||
-rw-r--r-- | block/Makefile | 2 | ||||
-rw-r--r-- | block/blk-barrier.c | 148 | ||||
-rw-r--r-- | block/blk-cgroup.c | 859 | ||||
-rw-r--r-- | block/blk-cgroup.h | 189 | ||||
-rw-r--r-- | block/blk-core.c | 97 | ||||
-rw-r--r-- | block/blk-integrity.c | 3 | ||||
-rw-r--r-- | block/blk-ioc.c | 3 | ||||
-rw-r--r-- | block/blk-lib.c | 233 | ||||
-rw-r--r-- | block/blk-merge.c | 8 | ||||
-rw-r--r-- | block/blk-settings.c | 143 | ||||
-rw-r--r-- | block/blk-sysfs.c | 39 | ||||
-rw-r--r-- | block/blk-tag.c | 1 | ||||
-rw-r--r-- | block/blk-timeout.c | 12 | ||||
-rw-r--r-- | block/bsg.c | 3 | ||||
-rw-r--r-- | block/cfq-iosched.c | 441 | ||||
-rw-r--r-- | block/cfq.h | 115 | ||||
-rw-r--r-- | block/compat_ioctl.c | 1 | ||||
-rw-r--r-- | block/elevator.c | 34 | ||||
-rw-r--r-- | block/genhd.c | 2 | ||||
-rw-r--r-- | block/ioctl.c | 3 | ||||
-rw-r--r-- | block/noop-iosched.c | 1 |
23 files changed, 1770 insertions, 605 deletions
diff --git a/block/Kconfig b/block/Kconfig index e20fbde0875c..9be0b56eaee1 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,28 +77,6 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. -config BLK_CGROUP - bool - depends on CGROUPS - default n - ---help--- - Generic block IO controller cgroup interface. This is the common - cgroup interface which should be used by various IO controlling - policies. - - Currently, CFQ IO scheduler uses it to recognize task groups and - control disk bandwidth allocation (proportional time slice allocation) - to such task groups. - -config DEBUG_BLK_CGROUP - bool - depends on BLK_CGROUP - default n - ---help--- - Enable some debugging help. Currently it stores the cgroup path - in the blk group which can be used by cfq for tracing various - group related activity. - endif # BLOCK config BLOCK_COMPAT diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index b71abfb0d726..3199b76f795d 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,6 +23,8 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" + # If BLK_CGROUP is a module, CFQ has to be built as module. + depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -32,23 +34,15 @@ config IOSCHED_CFQ This is the default I/O scheduler. + Note: If BLK_CGROUP=m, then CFQ can be built only as module. + config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" - depends on IOSCHED_CFQ && CGROUPS - select BLK_CGROUP + depends on IOSCHED_CFQ && BLK_CGROUP default n ---help--- Enable group IO scheduling in CFQ. -config DEBUG_CFQ_IOSCHED - bool "Debug CFQ Scheduling" - depends on CFQ_GROUP_IOSCHED - select DEBUG_BLK_CGROUP - default n - ---help--- - Enable CFQ IO scheduling debugging in CFQ. Currently it makes - blktrace output more verbose. - choice prompt "Default I/O scheduler" default DEFAULT_CFQ diff --git a/block/Makefile b/block/Makefile index cb2d515ebd6e..0bb499a739cd 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o + blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 8618d8996fea..0d710c9d403b 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/gfp.h> #include "blk.h" @@ -285,26 +286,31 @@ static void bio_end_empty_barrier(struct bio *bio, int err) set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); clear_bit(BIO_UPTODATE, &bio->bi_flags); } - - complete(bio->bi_private); + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); } /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for + * @gfp_mask: memory allocation flags (for bio_alloc) * @error_sector: error sector + * @flags: BLKDEV_IFL_* flags to control behaviour * * Description: * Issue a flush for the block device in question. Caller can supply * room for storing the error offset in case of a flush error, if they - * wish to. + * wish to. If WAIT flag is not passed then caller may check only what + * request was pushed in some internal queue for later handling. */ -int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) +int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, + sector_t *error_sector, unsigned long flags) { DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q; struct bio *bio; - int ret; + int ret = 0; if (bdev->bd_disk == NULL) return -ENXIO; @@ -313,23 +319,25 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) if (!q) return -ENXIO; - bio = bio_alloc(GFP_KERNEL, 0); + bio = bio_alloc(gfp_mask, 0); bio->bi_end_io = bio_end_empty_barrier; - bio->bi_private = &wait; bio->bi_bdev = bdev; - submit_bio(WRITE_BARRIER, bio); - - wait_for_completion(&wait); + if (test_bit(BLKDEV_WAIT, &flags)) + bio->bi_private = &wait; - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be copied - * from blk_rq_pos(rq). - */ - if (error_sector) - *error_sector = bio->bi_sector; + bio_get(bio); + submit_bio(WRITE_BARRIER, bio); + if (test_bit(BLKDEV_WAIT, &flags)) { + wait_for_completion(&wait); + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be + * copied from blk_rq_pos(rq). + */ + if (error_sector) + *error_sector = bio->bi_sector; + } - ret = 0; if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; else if (!bio_flagged(bio, BIO_UPTODATE)) @@ -339,107 +347,3 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) return ret; } EXPORT_SYMBOL(blkdev_issue_flush); - -static void blkdev_discard_end_io(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - - if (bio->bi_private) - complete(bio->bi_private); - __free_page(bio_page(bio)); - - bio_put(bio); -} - -/** - * blkdev_issue_discard - queue a discard - * @bdev: blockdev to issue discard for - * @sector: start sector - * @nr_sects: number of sectors to discard - * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: DISCARD_FL_* flags to control behaviour - * - * Description: - * Issue a discard request for the sectors in question. - */ -int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int flags) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct request_queue *q = bdev_get_queue(bdev); - int type = flags & DISCARD_FL_BARRIER ? - DISCARD_BARRIER : DISCARD_NOBARRIER; - struct bio *bio; - struct page *page; - int ret = 0; - - if (!q) - return -ENXIO; - - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - - while (nr_sects && !ret) { - unsigned int sector_size = q->limits.logical_block_size; - unsigned int max_discard_sectors = - min(q->limits.max_discard_sectors, UINT_MAX >> 9); - - bio = bio_alloc(gfp_mask, 1); - if (!bio) - goto out; - bio->bi_sector = sector; - bio->bi_end_io = blkdev_discard_end_io; - bio->bi_bdev = bdev; - if (flags & DISCARD_FL_WAIT) - bio->bi_private = &wait; - - /* - * Add a zeroed one-sector payload as that's what - * our current implementations need. If we'll ever need - * more the interface will need revisiting. - */ - page = alloc_page(gfp_mask | __GFP_ZERO); - if (!page) - goto out_free_bio; - if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) - goto out_free_page; - - /* - * And override the bio size - the way discard works we - * touch many more blocks on disk than the actual payload - * length. - */ - if (nr_sects > max_discard_sectors) { - bio->bi_size = max_discard_sectors << 9; - nr_sects -= max_discard_sectors; - sector += max_discard_sectors; - } else { - bio->bi_size = nr_sects << 9; - nr_sects = 0; - } - - bio_get(bio); - submit_bio(type, bio); - - if (flags & DISCARD_FL_WAIT) - wait_for_completion(&wait); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); - } - return ret; -out_free_page: - __free_page(page); -out_free_bio: - bio_put(bio); -out: - return -ENOMEM; -} -EXPORT_SYMBOL(blkdev_issue_discard); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1fa2654db0a6..a6809645d212 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -15,7 +15,12 @@ #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/err.h> +#include <linux/blkdev.h> +#include <linux/slab.h> #include "blk-cgroup.h" +#include <linux/genhd.h> + +#define MAX_KEY_LEN 100 static DEFINE_SPINLOCK(blkio_list_lock); static LIST_HEAD(blkio_list); @@ -23,19 +28,56 @@ static LIST_HEAD(blkio_list); struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; EXPORT_SYMBOL_GPL(blkio_root_cgroup); -bool blkiocg_css_tryget(struct blkio_cgroup *blkcg) +static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, + struct cgroup *); +static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, + struct task_struct *, bool); +static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, + struct cgroup *, struct task_struct *, bool); +static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); +static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); + +struct cgroup_subsys blkio_subsys = { + .name = "blkio", + .create = blkiocg_create, + .can_attach = blkiocg_can_attach, + .attach = blkiocg_attach, + .destroy = blkiocg_destroy, + .populate = blkiocg_populate, +#ifdef CONFIG_BLK_CGROUP + /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */ + .subsys_id = blkio_subsys_id, +#endif + .use_id = 1, + .module = THIS_MODULE, +}; +EXPORT_SYMBOL_GPL(blkio_subsys); + +static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) { - if (!css_tryget(&blkcg->css)) - return false; - return true; + list_add(&pn->node, &blkcg->policy_list); } -EXPORT_SYMBOL_GPL(blkiocg_css_tryget); -void blkiocg_css_put(struct blkio_cgroup *blkcg) +/* Must be called with blkcg->lock held */ +static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) { - css_put(&blkcg->css); + list_del(&pn->node); +} + +/* Must be called with blkcg->lock held */ +static struct blkio_policy_node * +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + list_for_each_entry(pn, &blkcg->policy_list, node) { + if (pn->dev == dev) + return pn; + } + + return NULL; } -EXPORT_SYMBOL_GPL(blkiocg_css_put); struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { @@ -44,13 +86,259 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) } EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); -void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, - unsigned long time, unsigned long sectors) +/* + * Add to the appropriate stat variable depending on the request type. + * This should be called with the blkg->stats_lock held. + */ +static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, + bool sync) +{ + if (direction) + stat[BLKIO_STAT_WRITE] += add; + else + stat[BLKIO_STAT_READ] += add; + if (sync) + stat[BLKIO_STAT_SYNC] += add; + else + stat[BLKIO_STAT_ASYNC] += add; +} + +/* + * Decrements the appropriate stat variable if non-zero depending on the + * request type. Panics on value being zero. + * This should be called with the blkg->stats_lock held. + */ +static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) +{ + if (direction) { + BUG_ON(stat[BLKIO_STAT_WRITE] == 0); + stat[BLKIO_STAT_WRITE]--; + } else { + BUG_ON(stat[BLKIO_STAT_READ] == 0); + stat[BLKIO_STAT_READ]--; + } + if (sync) { + BUG_ON(stat[BLKIO_STAT_SYNC] == 0); + stat[BLKIO_STAT_SYNC]--; + } else { + BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); + stat[BLKIO_STAT_ASYNC]--; + } +} + +#ifdef CONFIG_DEBUG_BLK_CGROUP +/* This should be called with the blkg->stats_lock held. */ +static void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) +{ + if (blkio_blkg_waiting(&blkg->stats)) + return; + if (blkg == curr_blkg) + return; + blkg->stats.start_group_wait_time = sched_clock(); + blkio_mark_blkg_waiting(&blkg->stats); +} + +/* This should be called with the blkg->stats_lock held. */ +static void blkio_update_group_wait_time(struct blkio_group_stats *stats) +{ + unsigned long long now; + + if (!blkio_blkg_waiting(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_group_wait_time)) + stats->group_wait_time += now - stats->start_group_wait_time; + blkio_clear_blkg_waiting(stats); +} + +/* This should be called with the blkg->stats_lock held. */ +static void blkio_end_empty_time(struct blkio_group_stats *stats) +{ + unsigned long long now; + + if (!blkio_blkg_empty(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_empty_time)) + stats->empty_time += now - stats->start_empty_time; + blkio_clear_blkg_empty(stats); +} + +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + BUG_ON(blkio_blkg_idling(&blkg->stats)); + blkg->stats.start_idle_time = sched_clock(); + blkio_mark_blkg_idling(&blkg->stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); + +void blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ + unsigned long flags; + unsigned long long now; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (blkio_blkg_idling(stats)) { + now = sched_clock(); + if (time_after64(now, stats->start_idle_time)) + stats->idle_time += now - stats->start_idle_time; + blkio_clear_blkg_idling(stats); + } + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); + +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + stats->avg_queue_size_sum += + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; + stats->avg_queue_size_samples++; + blkio_update_group_wait_time(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); + +void blkiocg_set_start_empty_time(struct blkio_group *blkg) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + + if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } + + /* + * group is already marked empty. This can happen if cfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. + */ + if(blkio_blkg_empty(stats)) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } + + stats->start_empty_time = sched_clock(); + blkio_mark_blkg_empty(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); + +void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkg->stats.dequeue += dequeue; +} +EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); +#else +static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) {} +static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} +#endif + +void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, + bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, + sync); + blkio_end_empty_time(&blkg->stats); + blkio_set_start_group_wait_time(blkg, curr_blkg); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); + +void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], + direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); + +void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkg->stats.time += time; + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); + +void blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) +{ + struct blkio_group_stats *stats; + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + stats->sectors += bytes >> 9; + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction, + sync); + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes, + direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); + +void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) { - blkg->time += time; - blkg->sectors += sectors; + struct blkio_group_stats *stats; + unsigned long flags; + unsigned long long now = sched_clock(); + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (time_after64(now, io_start_time)) + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], + now - io_start_time, direction, sync); + if (time_after64(io_start_time, start_time)) + blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], + io_start_time - start_time, direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); } -EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); +EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); + +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, + bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, + sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) @@ -58,14 +346,13 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, unsigned long flags; spin_lock_irqsave(&blkcg->lock, flags); + spin_lock_init(&blkg->stats_lock); rcu_assign_pointer(blkg->key, key); blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); spin_unlock_irqrestore(&blkcg->lock, flags); -#ifdef CONFIG_DEBUG_BLK_CGROUP /* Need to take css reference ? */ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); -#endif blkg->dev = dev; } EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); @@ -89,17 +376,16 @@ int blkiocg_del_blkio_group(struct blkio_group *blkg) rcu_read_lock(); css = css_lookup(&blkio_subsys, blkg->blkcg_id); - if (!css) - goto out; - - blkcg = container_of(css, struct blkio_cgroup, css); - spin_lock_irqsave(&blkcg->lock, flags); - if (!hlist_unhashed(&blkg->blkcg_node)) { - __blkiocg_del_blkio_group(blkg); - ret = 0; + if (css) { + blkcg = container_of(css, struct blkio_cgroup, css); + spin_lock_irqsave(&blkcg->lock, flags); + if (!hlist_unhashed(&blkg->blkcg_node)) { + __blkiocg_del_blkio_group(blkg); + ret = 0; + } + spin_unlock_irqrestore(&blkcg->lock, flags); } - spin_unlock_irqrestore(&blkcg->lock, flags); -out: + rcu_read_unlock(); return ret; } @@ -142,31 +428,179 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) struct blkio_group *blkg; struct hlist_node *n; struct blkio_policy_type *blkiop; + struct blkio_policy_node *pn; if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) return -EINVAL; blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock(&blkio_list_lock); spin_lock_irq(&blkcg->lock); blkcg->weight = (unsigned int)val; + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - spin_lock(&blkio_list_lock); + pn = blkio_policy_search_node(blkcg, blkg->dev); + + if (pn) + continue; + list_for_each_entry(blkiop, &blkio_list, list) blkiop->ops.blkio_update_group_weight_fn(blkg, blkcg->weight); - spin_unlock(&blkio_list_lock); } spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); return 0; } -#define SHOW_FUNCTION_PER_GROUP(__VAR) \ +static int +blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) +{ + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + struct blkio_group_stats *stats; + struct hlist_node *n; + uint64_t queued[BLKIO_STAT_TOTAL]; + int i; +#ifdef CONFIG_DEBUG_BLK_CGROUP + bool idling, waiting, empty; + unsigned long long now = sched_clock(); +#endif + + blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock_irq(&blkcg->lock); + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + spin_lock(&blkg->stats_lock); + stats = &blkg->stats; +#ifdef CONFIG_DEBUG_BLK_CGROUP + idling = blkio_blkg_idling(stats); + waiting = blkio_blkg_waiting(stats); + empty = blkio_blkg_empty(stats); +#endif + for (i = 0; i < BLKIO_STAT_TOTAL; i++) + queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; + memset(stats, 0, sizeof(struct blkio_group_stats)); + for (i = 0; i < BLKIO_STAT_TOTAL; i++) + stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; +#ifdef CONFIG_DEBUG_BLK_CGROUP + if (idling) { + blkio_mark_blkg_idling(stats); + stats->start_idle_time = now; + } + if (waiting) { + blkio_mark_blkg_waiting(stats); + stats->start_group_wait_time = now; + } + if (empty) { + blkio_mark_blkg_empty(stats); + stats->start_empty_time = now; + } +#endif + spin_unlock(&blkg->stats_lock); + } + spin_unlock_irq(&blkcg->lock); + return 0; +} + +static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, + int chars_left, bool diskname_only) +{ + snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); + chars_left -= strlen(str); + if (chars_left <= 0) { + printk(KERN_WARNING + "Possibly incorrect cgroup stat display format"); + return; + } + if (diskname_only) + return; + switch (type) { + case BLKIO_STAT_READ: + strlcat(str, " Read", chars_left); + break; + case BLKIO_STAT_WRITE: + strlcat(str, " Write", chars_left); + break; + case BLKIO_STAT_SYNC: + strlcat(str, " Sync", chars_left); + break; + case BLKIO_STAT_ASYNC: + strlcat(str, " Async", chars_left); + break; + case BLKIO_STAT_TOTAL: + strlcat(str, " Total", chars_left); + break; + default: + strlcat(str, " Invalid", chars_left); + } +} + +static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, + struct cgroup_map_cb *cb, dev_t dev) +{ + blkio_get_key_name(0, dev, str, chars_left, true); + cb->fill(cb, str, val); + return val; +} + +/* This should be called with blkg->stats_lock held */ +static uint64_t blkio_get_stat(struct blkio_group *blkg, + struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) +{ + uint64_t disk_total; + char key_str[MAX_KEY_LEN]; + enum stat_sub_type sub_type; + + if (type == BLKIO_STAT_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.time, cb, dev); + if (type == BLKIO_STAT_SECTORS) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.sectors, cb, dev); +#ifdef CONFIG_DEBUG_BLK_CGROUP + if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { + uint64_t sum = blkg->stats.avg_queue_size_sum; + uint64_t samples = blkg->stats.avg_queue_size_samples; + if (samples) + do_div(sum, samples); + else + sum = 0; + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); + } + if (type == BLKIO_STAT_GROUP_WAIT_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.group_wait_time, cb, dev); + if (type == BLKIO_STAT_IDLE_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.idle_time, cb, dev); + if (type == BLKIO_STAT_EMPTY_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.empty_time, cb, dev); + if (type == BLKIO_STAT_DEQUEUE) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.dequeue, cb, dev); +#endif + + for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; + sub_type++) { + blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); + } + disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + + blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; + blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, disk_total); + return disk_total; +} + +#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ - struct cftype *cftype, struct seq_file *m) \ + struct cftype *cftype, struct cgroup_map_cb *cb) \ { \ struct blkio_cgroup *blkcg; \ struct blkio_group *blkg; \ struct hlist_node *n; \ + uint64_t cgroup_total = 0; \ \ if (!cgroup_lock_live_group(cgroup)) \ return -ENODEV; \ @@ -174,50 +608,293 @@ static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ blkcg = cgroup_to_blkio_cgroup(cgroup); \ rcu_read_lock(); \ hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ - if (blkg->dev) \ - seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ - MINOR(blkg->dev), blkg->__VAR); \ + if (blkg->dev) { \ + spin_lock_irq(&blkg->stats_lock); \ + cgroup_total += blkio_get_stat(blkg, cb, \ + blkg->dev, type); \ + spin_unlock_irq(&blkg->stats_lock); \ + } \ } \ + if (show_total) \ + cb->fill(cb, "Total", cgroup_total); \ rcu_read_unlock(); \ cgroup_unlock(); \ return 0; \ } -SHOW_FUNCTION_PER_GROUP(time); -SHOW_FUNCTION_PER_GROUP(sectors); +SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0); +SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0); +SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1); +SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1); +SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1); +SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1); +SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1); +SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1); #ifdef CONFIG_DEBUG_BLK_CGROUP -SHOW_FUNCTION_PER_GROUP(dequeue); +SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0); +SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0); +SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0); +SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0); +SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0); #endif #undef SHOW_FUNCTION_PER_GROUP -#ifdef CONFIG_DEBUG_BLK_CGROUP -void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, - unsigned long dequeue) +static int blkio_check_dev_num(dev_t dev) { - blkg->dequeue += dequeue; + int part = 0; + struct gendisk *disk; + + disk = get_gendisk(dev, &part); + if (!disk || part) + return -ENODEV; + + return 0; +} + +static int blkio_policy_parse_and_set(char *buf, + struct blkio_policy_node *newpn) +{ + char *s[4], *p, *major_s = NULL, *minor_s = NULL; + int ret; + unsigned long major, minor, temp; + int i = 0; + dev_t dev; + + memset(s, 0, sizeof(s)); + + while ((p = strsep(&buf, " ")) != NULL) { + if (!*p) + continue; + + s[i++] = p; + + /* Prevent from inputing too many things */ + if (i == 3) + break; + } + + if (i != 2) + return -EINVAL; + + p = strsep(&s[0], ":"); + if (p != NULL) + major_s = p; + else + return -EINVAL; + + minor_s = s[0]; + if (!minor_s) + return -EINVAL; + + ret = strict_strtoul(major_s, 10, &major); + if (ret) + return -EINVAL; + + ret = strict_strtoul(minor_s, 10, &minor); + if (ret) + return -EINVAL; + + dev = MKDEV(major, minor); + + ret = blkio_check_dev_num(dev); + if (ret) + return ret; + + newpn->dev = dev; + + if (s[1] == NULL) + return -EINVAL; + + ret = strict_strtoul(s[1], 10, &temp); + if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || + temp > BLKIO_WEIGHT_MAX) + return -EINVAL; + + newpn->weight = temp; + + return 0; +} + +unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, + dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev); + if (pn) + return pn->weight; + else + return blkcg->weight; +} +EXPORT_SYMBOL_GPL(blkcg_get_weight); + + +static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + int ret = 0; + char *buf; + struct blkio_policy_node *newpn, *pn; + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + int keep_newpn = 0; + struct hlist_node *n; + struct blkio_policy_type *blkiop; + + buf = kstrdup(buffer, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); + if (!newpn) { + ret = -ENOMEM; + goto free_buf; + } + + ret = blkio_policy_parse_and_set(buf, newpn); + if (ret) + goto free_newpn; + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + spin_lock_irq(&blkcg->lock); + + pn = blkio_policy_search_node(blkcg, newpn->dev); + if (!pn) { + if (newpn->weight != 0) { + blkio_policy_insert_node(blkcg, newpn); + keep_newpn = 1; + } + spin_unlock_irq(&blkcg->lock); + goto update_io_group; + } + + if (newpn->weight == 0) { + /* weight == 0 means deleteing a specific weight */ + blkio_policy_delete_node(pn); + spin_unlock_irq(&blkcg->lock); + goto update_io_group; + } + spin_unlock_irq(&blkcg->lock); + + pn->weight = newpn->weight; + +update_io_group: + /* update weight for each cfqg */ + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (newpn->dev == blkg->dev) { + list_for_each_entry(blkiop, &blkio_list, list) + blkiop->ops.blkio_update_group_weight_fn(blkg, + newpn->weight ? + newpn->weight : + blkcg->weight); + } + } + + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); + +free_newpn: + if (!keep_newpn) + kfree(newpn); +free_buf: + kfree(buf); + return ret; +} + +static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *m) +{ + struct blkio_cgroup *blkcg; + struct blkio_policy_node *pn; + + seq_printf(m, "dev\tweight\n"); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + if (!list_empty(&blkcg->policy_list)) { + spin_lock_irq(&blkcg->lock); + list_for_each_entry(pn, &blkcg->policy_list, node) { + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->weight); + } + spin_unlock_irq(&blkcg->lock); + } + + return 0; } -EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats); -#endif struct cftype blkio_files[] = { { + .name = "weight_device", + .read_seq_string = blkiocg_weight_device_read, + .write_string = blkiocg_weight_device_write, + .max_write_len = 256, + }, + { .name = "weight", .read_u64 = blkiocg_weight_read, .write_u64 = blkiocg_weight_write, }, { .name = "time", - .read_seq_string = blkiocg_time_read, + .read_map = blkiocg_time_read, }, { .name = "sectors", - .read_seq_string = blkiocg_sectors_read, + .read_map = blkiocg_sectors_read, + }, + { + .name = "io_service_bytes", + .read_map = blkiocg_io_service_bytes_read, + }, + { + .name = "io_serviced", + .read_map = blkiocg_io_serviced_read, + }, + { + .name = "io_service_time", + .read_map = blkiocg_io_service_time_read, + }, + { + .name = "io_wait_time", + .read_map = blkiocg_io_wait_time_read, + }, + { + .name = "io_merged", + .read_map = blkiocg_io_merged_read, + }, + { + .name = "io_queued", + .read_map = blkiocg_io_queued_read, + }, + { + .name = "reset_stats", + .write_u64 = blkiocg_reset_stats, }, #ifdef CONFIG_DEBUG_BLK_CGROUP - { + { + .name = "avg_queue_size", + .read_map = blkiocg_avg_queue_size_read, + }, + { + .name = "group_wait_time", + .read_map = blkiocg_group_wait_time_read, + }, + { + .name = "idle_time", + .read_map = blkiocg_idle_time_read, + }, + { + .name = "empty_time", + .read_map = blkiocg_empty_time_read, + }, + { .name = "dequeue", - .read_seq_string = blkiocg_dequeue_read, - }, + .read_map = blkiocg_dequeue_read, + }, #endif }; @@ -234,55 +911,61 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) struct blkio_group *blkg; void *key; struct blkio_policy_type *blkiop; + struct blkio_policy_node *pn, *pntmp; rcu_read_lock(); -remove_entry: - spin_lock_irqsave(&blkcg->lock, flags); + do { + spin_lock_irqsave(&blkcg->lock, flags); + + if (hlist_empty(&blkcg->blkg_list)) { + spin_unlock_irqrestore(&blkcg->lock, flags); + break; + } + + blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, + blkcg_node); + key = rcu_dereference(blkg->key); + __blkiocg_del_blkio_group(blkg); - if (hlist_empty(&blkcg->blkg_list)) { spin_unlock_irqrestore(&blkcg->lock, flags); - goto done; - } - blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, - blkcg_node); - key = rcu_dereference(blkg->key); - __blkiocg_del_blkio_group(blkg); + /* + * This blkio_group is being unlinked as associated cgroup is + * going away. Let all the IO controlling policies know about + * this event. Currently this is static call to one io + * controlling policy. Once we have more policies in place, we + * need some dynamic registration of callback function. + */ + spin_lock(&blkio_list_lock); + list_for_each_entry(blkiop, &blkio_list, list) + blkiop->ops.blkio_unlink_group_fn(key, blkg); + spin_unlock(&blkio_list_lock); + } while (1); - spin_unlock_irqrestore(&blkcg->lock, flags); + list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { + blkio_policy_delete_node(pn); + kfree(pn); + } - /* - * This blkio_group is being unlinked as associated cgroup is going - * away. Let all the IO controlling policies know about this event. - * - * Currently this is static call to one io controlling policy. Once - * we have more policies in place, we need some dynamic registration - * of callback function. - */ - spin_lock(&blkio_list_lock); - list_for_each_entry(blkiop, &blkio_list, list) - blkiop->ops.blkio_unlink_group_fn(key, blkg); - spin_unlock(&blkio_list_lock); - goto remove_entry; -done: free_css_id(&blkio_subsys, &blkcg->css); rcu_read_unlock(); - kfree(blkcg); + if (blkcg != &blkio_root_cgroup) + kfree(blkcg); } static struct cgroup_subsys_state * blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) { - struct blkio_cgroup *blkcg, *parent_blkcg; + struct blkio_cgroup *blkcg; + struct cgroup *parent = cgroup->parent; - if (!cgroup->parent) { + if (!parent) { blkcg = &blkio_root_cgroup; goto done; } /* Currently we do not support hierarchy deeper than two level (0,1) */ - parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); - if (css_depth(&parent_blkcg->css) > 0) + if (parent != cgroup->top_cgroup) return ERR_PTR(-EINVAL); blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); @@ -294,6 +977,7 @@ done: spin_lock_init(&blkcg->lock); INIT_HLIST_HEAD(&blkcg->blkg_list); + INIT_LIST_HEAD(&blkcg->policy_list); return &blkcg->css; } @@ -333,17 +1017,6 @@ static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, task_unlock(tsk); } -struct cgroup_subsys blkio_subsys = { - .name = "blkio", - .create = blkiocg_create, - .can_attach = blkiocg_can_attach, - .attach = blkiocg_attach, - .destroy = blkiocg_destroy, - .populate = blkiocg_populate, - .subsys_id = blkio_subsys_id, - .use_id = 1, -}; - void blkio_policy_register(struct blkio_policy_type *blkiop) { spin_lock(&blkio_list_lock); @@ -359,3 +1032,17 @@ void blkio_policy_unregister(struct blkio_policy_type *blkiop) spin_unlock(&blkio_list_lock); } EXPORT_SYMBOL_GPL(blkio_policy_unregister); + +static int __init init_cgroup_blkio(void) +{ + return cgroup_load_subsys(&blkio_subsys); +} + +static void __exit exit_cgroup_blkio(void) +{ + cgroup_unload_subsys(&blkio_subsys); +} + +module_init(init_cgroup_blkio); +module_exit(exit_cgroup_blkio); +MODULE_LICENSE("GPL"); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 4d316df863b4..2b866ec1dcea 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,13 +15,92 @@ #include <linux/cgroup.h> -#ifdef CONFIG_BLK_CGROUP +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) + +#ifndef CONFIG_BLK_CGROUP +/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */ +extern struct cgroup_subsys blkio_subsys; +#define blkio_subsys_id blkio_subsys.subsys_id +#endif + +enum stat_type { + /* Total time spent (in ns) between request dispatch to the driver and + * request completion for IOs doen by this cgroup. This may not be + * accurate when NCQ is turned on. */ + BLKIO_STAT_SERVICE_TIME = 0, + /* Total bytes transferred */ + BLKIO_STAT_SERVICE_BYTES, + /* Total IOs serviced, post merge */ + BLKIO_STAT_SERVICED, + /* Total time spent waiting in scheduler queue in ns */ + BLKIO_STAT_WAIT_TIME, + /* Number of IOs merged */ + BLKIO_STAT_MERGED, + /* Number of IOs queued up */ + BLKIO_STAT_QUEUED, + /* All the single valued stats go below this */ + BLKIO_STAT_TIME, + BLKIO_STAT_SECTORS, +#ifdef CONFIG_DEBUG_BLK_CGROUP + BLKIO_STAT_AVG_QUEUE_SIZE, + BLKIO_STAT_IDLE_TIME, + BLKIO_STAT_EMPTY_TIME, + BLKIO_STAT_GROUP_WAIT_TIME, + BLKIO_STAT_DEQUEUE +#endif +}; + +enum stat_sub_type { + BLKIO_STAT_READ = 0, + BLKIO_STAT_WRITE, + BLKIO_STAT_SYNC, + BLKIO_STAT_ASYNC, + BLKIO_STAT_TOTAL +}; + +/* blkg state flags */ +enum blkg_state_flags { + BLKG_waiting = 0, + BLKG_idling, + BLKG_empty, +}; struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; spinlock_t lock; struct hlist_head blkg_list; + struct list_head policy_list; /* list of blkio_policy_node */ +}; + +struct blkio_group_stats { + /* total disk time and nr sectors dispatched by this group */ + uint64_t time; + uint64_t sectors; + uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Sum of number of IOs queued across all samples */ + uint64_t avg_queue_size_sum; + /* Count of samples taken for average */ + uint64_t avg_queue_size_samples; + /* How many times this group has been removed from service tree */ + unsigned long dequeue; + + /* Total time spent waiting for it to be assigned a timeslice. */ + uint64_t group_wait_time; + uint64_t start_group_wait_time; + + /* Time spent idling for this blkio_group */ + uint64_t idle_time; + uint64_t start_idle_time; + /* + * Total time when we have requests queued and do not contain the + * current active queue. + */ + uint64_t empty_time; + uint64_t start_empty_time; + uint16_t flags; +#endif }; struct blkio_group { @@ -29,22 +108,24 @@ struct blkio_group { void *key; struct hlist_node blkcg_node; unsigned short blkcg_id; -#ifdef CONFIG_DEBUG_BLK_CGROUP /* Store cgroup path */ char path[128]; - /* How many times this group has been removed from service tree */ - unsigned long dequeue; -#endif /* The device MKDEV(major, minor), this group has been created for */ - dev_t dev; + dev_t dev; - /* total disk time and nr sectors dispatched by this group */ - unsigned long time; - unsigned long sectors; + /* Need to serialize the stats in the case of reset/update */ + spinlock_t stats_lock; + struct blkio_group_stats stats; +}; + +struct blkio_policy_node { + struct list_head node; + dev_t dev; + unsigned int weight; }; -extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg); -extern void blkiocg_css_put(struct blkio_cgroup *blkcg); +extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, + dev_t dev); typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, @@ -64,6 +145,11 @@ struct blkio_policy_type { extern void blkio_policy_register(struct blkio_policy_type *); extern void blkio_policy_unregister(struct blkio_policy_type *); +static inline char *blkg_path(struct blkio_group *blkg) +{ + return blkg->path; +} + #else struct blkio_group { @@ -75,6 +161,8 @@ struct blkio_policy_type { static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } +static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } + #endif #define BLKIO_WEIGHT_MIN 100 @@ -82,19 +170,45 @@ static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } #define BLKIO_WEIGHT_DEFAULT 500 #ifdef CONFIG_DEBUG_BLK_CGROUP -static inline char *blkg_path(struct blkio_group *blkg) -{ - return blkg->path; -} -void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); +void blkiocg_update_dequeue_stats(struct blkio_group *blkg, unsigned long dequeue); +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); +void blkiocg_update_idle_time_stats(struct blkio_group *blkg); +void blkiocg_set_start_empty_time(struct blkio_group *blkg); + +#define BLKG_FLAG_FNS(name) \ +static inline void blkio_mark_blkg_##name( \ + struct blkio_group_stats *stats) \ +{ \ + stats->flags |= (1 << BLKG_##name); \ +} \ +static inline void blkio_clear_blkg_##name( \ + struct blkio_group_stats *stats) \ +{ \ + stats->flags &= ~(1 << BLKG_##name); \ +} \ +static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \ +{ \ + return (stats->flags & (1 << BLKG_##name)) != 0; \ +} \ + +BLKG_FLAG_FNS(waiting) +BLKG_FLAG_FNS(idling) +BLKG_FLAG_FNS(empty) +#undef BLKG_FLAG_FNS #else -static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } -static inline void blkiocg_update_blkio_group_dequeue_stats( - struct blkio_group *blkg, unsigned long dequeue) {} +static inline void blkiocg_update_avg_queue_size_stats( + struct blkio_group *blkg) {} +static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) {} +static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{} +static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {} +static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} #endif -#ifdef CONFIG_BLK_CGROUP +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, @@ -102,26 +216,43 @@ extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); -void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, - unsigned long time, unsigned long sectors); +void blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time); +void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, + bool direction, bool sync); +void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, + bool sync); +void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync); +void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync); #else struct cgroup; static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key, dev_t dev) -{ -} + struct blkio_group *blkg, void *key, dev_t dev) {} static inline int blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } static inline struct blkio_group * blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } -static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, - unsigned long time, unsigned long sectors) -{ -} +static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time) {} +static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) {} +static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, + bool sync) {} +static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) {} +static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) {} #endif #endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index 718897e6d37f..f0640d7f800f 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -127,6 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->tag = -1; rq->ref_count = 1; rq->start_time = jiffies; + set_start_time_ns(rq); } EXPORT_SYMBOL(blk_rq_init); @@ -450,6 +451,7 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_sync_queue(q); + del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); mutex_lock(&q->sysfs_lock); queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); mutex_unlock(&q->sysfs_lock); @@ -465,6 +467,9 @@ static int blk_init_free_list(struct request_queue *q) { struct request_list *rl = &q->rq; + if (unlikely(rl->rq_pool)) + return 0; + rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; rl->elvpriv = 0; @@ -510,6 +515,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) return NULL; } + setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, + laptop_mode_timer_fn, (unsigned long) q); init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); @@ -566,16 +573,38 @@ EXPORT_SYMBOL(blk_init_queue); struct request_queue * blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) { - struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); + struct request_queue *uninit_q, *q; + + uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id); + if (!uninit_q) + return NULL; + + q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id); + if (!q) + blk_cleanup_queue(uninit_q); + + return q; +} +EXPORT_SYMBOL(blk_init_queue_node); + +struct request_queue * +blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, + spinlock_t *lock) +{ + return blk_init_allocated_queue_node(q, rfn, lock, -1); +} +EXPORT_SYMBOL(blk_init_allocated_queue); +struct request_queue * +blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, + spinlock_t *lock, int node_id) +{ if (!q) return NULL; q->node = node_id; - if (blk_init_free_list(q)) { - kmem_cache_free(blk_requestq_cachep, q); + if (blk_init_free_list(q)) return NULL; - } q->request_fn = rfn; q->prep_rq_fn = NULL; @@ -598,10 +627,9 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) return q; } - blk_put_queue(q); return NULL; } -EXPORT_SYMBOL(blk_init_queue_node); +EXPORT_SYMBOL(blk_init_allocated_queue_node); int blk_get_queue(struct request_queue *q) { @@ -1121,13 +1149,10 @@ void init_request_from_bio(struct request *req, struct bio *bio) else req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK; - if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) { + if (bio_rw_flagged(bio, BIO_RW_DISCARD)) req->cmd_flags |= REQ_DISCARD; - if (bio_rw_flagged(bio, BIO_RW_BARRIER)) - req->cmd_flags |= REQ_SOFTBARRIER; - } else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) + if (bio_rw_flagged(bio, BIO_RW_BARRIER)) req->cmd_flags |= REQ_HARDBARRIER; - if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) req->cmd_flags |= REQ_RW_SYNC; if (bio_rw_flagged(bio, BIO_RW_META)) @@ -1147,7 +1172,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) */ static inline bool queue_should_plug(struct request_queue *q) { - return !(blk_queue_nonrot(q) && blk_queue_queuing(q)); + return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); } static int __make_request(struct request_queue *q, struct bio *bio) @@ -1198,6 +1223,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); if (!attempt_back_merge(q, req)) elv_merged_request(q, req, el_ret); goto out; @@ -1231,6 +1257,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); if (!attempt_front_merge(q, req)) elv_merged_request(q, req, el_ret); goto out; @@ -1490,9 +1517,9 @@ end_io: /* * We only want one ->make_request_fn to be active at a time, * else stack usage with stacked devices could be a problem. - * So use current->bio_{list,tail} to keep a list of requests + * So use current->bio_list to keep a list of requests * submited by a make_request_fn function. - * current->bio_tail is also used as a flag to say if + * current->bio_list is also used as a flag to say if * generic_make_request is currently active in this task or not. * If it is NULL, then no make_request is active. If it is non-NULL, * then a make_request is active, and new requests should be added @@ -1500,11 +1527,11 @@ end_io: */ void generic_make_request(struct bio *bio) { - if (current->bio_tail) { + struct bio_list bio_list_on_stack; + + if (current->bio_list) { /* make_request is active */ - *(current->bio_tail) = bio; - bio->bi_next = NULL; - current->bio_tail = &bio->bi_next; + bio_list_add(current->bio_list, bio); return; } /* following loop may be a bit non-obvious, and so deserves some @@ -1512,30 +1539,27 @@ void generic_make_request(struct bio *bio) * Before entering the loop, bio->bi_next is NULL (as all callers * ensure that) so we have a list with a single bio. * We pretend that we have just taken it off a longer list, so - * we assign bio_list to the next (which is NULL) and bio_tail - * to &bio_list, thus initialising the bio_list of new bios to be + * we assign bio_list to a pointer to the bio_list_on_stack, + * thus initialising the bio_list of new bios to be * added. __generic_make_request may indeed add some more bios * through a recursive call to generic_make_request. If it * did, we find a non-NULL value in bio_list and re-enter the loop * from the top. In this case we really did just take the bio - * of the top of the list (no pretending) and so fixup bio_list and - * bio_tail or bi_next, and call into __generic_make_request again. + * of the top of the list (no pretending) and so remove it from + * bio_list, and call into __generic_make_request again. * * The loop was structured like this to make only one call to * __generic_make_request (which is important as it is large and * inlined) and to keep the structure simple. */ BUG_ON(bio->bi_next); + bio_list_init(&bio_list_on_stack); + current->bio_list = &bio_list_on_stack; do { - current->bio_list = bio->bi_next; - if (bio->bi_next == NULL) - current->bio_tail = ¤t->bio_list; - else - bio->bi_next = NULL; __generic_make_request(bio); - bio = current->bio_list; + bio = bio_list_pop(current->bio_list); } while (bio); - current->bio_tail = NULL; /* deactivate */ + current->bio_list = NULL; /* deactivate */ } EXPORT_SYMBOL(generic_make_request); @@ -1559,7 +1583,7 @@ void submit_bio(int rw, struct bio *bio) * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. */ - if (bio_has_data(bio)) { + if (bio_has_data(bio) && !(rw & (1 << BIO_RW_DISCARD))) { if (rw & WRITE) { count_vm_events(PGPGOUT, count); } else { @@ -1617,8 +1641,7 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) * limitation. */ blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > queue_max_phys_segments(q) || - rq->nr_phys_segments > queue_max_hw_segments(q)) { + if (rq->nr_phys_segments > queue_max_segments(q)) { printk(KERN_ERR "%s: over max segments limit.\n", __func__); return -EIO; } @@ -1861,12 +1884,7 @@ void blk_dequeue_request(struct request *rq) */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]++; - /* - * Mark this device as supporting hardware queuing, if - * we have more IOs in flight than 4. - */ - if (!blk_queue_queuing(q) && queue_in_flight(q) > 4) - set_bit(QUEUE_FLAG_CQ, &q->queue_flags); + set_io_start_time_ns(rq); } } @@ -2109,7 +2127,7 @@ static void blk_finish_request(struct request *req, int error) BUG_ON(blk_queued_rq(req)); if (unlikely(laptop_mode) && blk_fs_request(req)) - laptop_io_completion(); + laptop_io_completion(&req->q->backing_dev_info); blk_delete_timer(req); @@ -2528,4 +2546,3 @@ int __init blk_dev_init(void) return 0; } - diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 15c630813b1c..edce1ef7933d 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -24,6 +24,7 @@ #include <linux/mempool.h> #include <linux/bio.h> #include <linux/scatterlist.h> +#include <linux/slab.h> #include "blk.h" @@ -278,7 +279,7 @@ static struct attribute *integrity_attrs[] = { NULL, }; -static struct sysfs_ops integrity_ops = { +static const struct sysfs_ops integrity_ops = { .show = &integrity_attr_show, .store = &integrity_attr_store, }; diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 98e6bf61b0ac..d22c4c55c406 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -7,6 +7,7 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ +#include <linux/slab.h> #include "blk.h" @@ -91,7 +92,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) spin_lock_init(&ret->lock); ret->ioprio_changed = 0; ret->ioprio = 0; - ret->last_waited = jiffies; /* doesn't matter... */ + ret->last_waited = 0; /* doesn't matter... */ ret->nr_batch_requests = 0; /* because this is 0 */ INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); diff --git a/block/blk-lib.c b/block/blk-lib.c new file mode 100644 index 000000000000..d0216b9f22d4 --- /dev/null +++ b/block/blk-lib.c @@ -0,0 +1,233 @@ +/* + * Functions related to generic helpers functions + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/scatterlist.h> + +#include "blk.h" + +static void blkdev_discard_end_io(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + + if (bio->bi_private) + complete(bio->bi_private); + __free_page(bio_page(bio)); + + bio_put(bio); +} + +/** + * blkdev_issue_discard - queue a discard + * @bdev: blockdev to issue discard for + * @sector: start sector + * @nr_sects: number of sectors to discard + * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: BLKDEV_IFL_* flags to control behaviour + * + * Description: + * Issue a discard request for the sectors in question. + */ +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q = bdev_get_queue(bdev); + int type = flags & BLKDEV_IFL_BARRIER ? + DISCARD_BARRIER : DISCARD_NOBARRIER; + struct bio *bio; + struct page *page; + int ret = 0; + + if (!q) + return -ENXIO; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + while (nr_sects && !ret) { + unsigned int sector_size = q->limits.logical_block_size; + unsigned int max_discard_sectors = + min(q->limits.max_discard_sectors, UINT_MAX >> 9); + + bio = bio_alloc(gfp_mask, 1); + if (!bio) + goto out; + bio->bi_sector = sector; + bio->bi_end_io = blkdev_discard_end_io; + bio->bi_bdev = bdev; + if (flags & BLKDEV_IFL_WAIT) + bio->bi_private = &wait; + + /* + * Add a zeroed one-sector payload as that's what + * our current implementations need. If we'll ever need + * more the interface will need revisiting. + */ + page = alloc_page(gfp_mask | __GFP_ZERO); + if (!page) + goto out_free_bio; + if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) + goto out_free_page; + + /* + * And override the bio size - the way discard works we + * touch many more blocks on disk than the actual payload + * length. + */ + if (nr_sects > max_discard_sectors) { + bio->bi_size = max_discard_sectors << 9; + nr_sects -= max_discard_sectors; + sector += max_discard_sectors; + } else { + bio->bi_size = nr_sects << 9; + nr_sects = 0; + } + + bio_get(bio); + submit_bio(type, bio); + + if (flags & BLKDEV_IFL_WAIT) + wait_for_completion(&wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + else if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + bio_put(bio); + } + return ret; +out_free_page: + __free_page(page); +out_free_bio: + bio_put(bio); +out: + return -ENOMEM; +} +EXPORT_SYMBOL(blkdev_issue_discard); + +struct bio_batch +{ + atomic_t done; + unsigned long flags; + struct completion *wait; + bio_end_io_t *end_io; +}; + +static void bio_batch_end_io(struct bio *bio, int err) +{ + struct bio_batch *bb = bio->bi_private; + + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bb->flags); + else + clear_bit(BIO_UPTODATE, &bb->flags); + } + if (bb) { + if (bb->end_io) + bb->end_io(bio, err); + atomic_inc(&bb->done); + complete(bb->wait); + } + bio_put(bio); +} + +/** + * blkdev_issue_zeroout generate number of zero filed write bios + * @bdev: blockdev to issue + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: BLKDEV_IFL_* flags to control behaviour + * + * Description: + * Generate and issue number of bios with zerofiled pages. + * Send barrier at the beginning and at the end if requested. This guarantie + * correct request ordering. Empty barrier allow us to avoid post queue flush. + */ + +int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +{ + int ret = 0; + struct bio *bio; + struct bio_batch bb; + unsigned int sz, issued = 0; + DECLARE_COMPLETION_ONSTACK(wait); + + atomic_set(&bb.done, 0); + bb.flags = 1 << BIO_UPTODATE; + bb.wait = &wait; + bb.end_io = NULL; + + if (flags & BLKDEV_IFL_BARRIER) { + /* issue async barrier before the data */ + ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0); + if (ret) + return ret; + } +submit: + while (nr_sects != 0) { + bio = bio_alloc(gfp_mask, + min(nr_sects, (sector_t)BIO_MAX_PAGES)); + if (!bio) + break; + + bio->bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_end_io = bio_batch_end_io; + if (flags & BLKDEV_IFL_WAIT) + bio->bi_private = &bb; + + while (nr_sects != 0) { + sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); + if (sz == 0) + /* bio has maximum size possible */ + break; + ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); + nr_sects -= ret >> 9; + sector += ret >> 9; + if (ret < (sz << 9)) + break; + } + issued++; + submit_bio(WRITE, bio); + } + /* + * When all data bios are in flight. Send final barrier if requeted. + */ + if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER) + ret = blkdev_issue_flush(bdev, gfp_mask, NULL, + flags & BLKDEV_IFL_WAIT); + + + if (flags & BLKDEV_IFL_WAIT) + /* Wait for bios in-flight */ + while ( issued != atomic_read(&bb.done)) + wait_for_completion(&wait); + + if (!test_bit(BIO_UPTODATE, &bb.flags)) + /* One of bios in the batch was completed with error.*/ + ret = -EIO; + + if (ret) + goto out; + + if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) { + ret = -EOPNOTSUPP; + goto out; + } + if (nr_sects != 0) + goto submit; +out: + return ret; +} +EXPORT_SYMBOL(blkdev_issue_zeroout); diff --git a/block/blk-merge.c b/block/blk-merge.c index 99cb5cf1f447..5e7dc9973458 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -206,8 +206,7 @@ static inline int ll_new_hw_segment(struct request_queue *q, { int nr_phys_segs = bio_phys_segments(q, bio); - if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) || - req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) { + if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -300,10 +299,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, total_phys_segments--; } - if (total_phys_segments > queue_max_phys_segments(q)) - return 0; - - if (total_phys_segments > queue_max_hw_segments(q)) + if (total_phys_segments > queue_max_segments(q)) return 0; /* Merge is OK... */ diff --git a/block/blk-settings.c b/block/blk-settings.c index 5eeb9e0d256e..f5ed5a1187ba 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -8,7 +8,9 @@ #include <linux/blkdev.h> #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ #include <linux/gcd.h> +#include <linux/lcm.h> #include <linux/jiffies.h> +#include <linux/gfp.h> #include "blk.h" @@ -91,10 +93,9 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); */ void blk_set_default_limits(struct queue_limits *lim) { - lim->max_phys_segments = MAX_PHYS_SEGMENTS; - lim->max_hw_segments = MAX_HW_SEGMENTS; + lim->max_segments = BLK_MAX_SEGMENTS; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - lim->max_segment_size = MAX_SEGMENT_SIZE; + lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; lim->max_hw_sectors = INT_MAX; lim->max_discard_sectors = 0; @@ -154,7 +155,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->unplug_timer.data = (unsigned long)q; blk_set_default_limits(&q->limits); - blk_queue_max_sectors(q, SAFE_MAX_SECTORS); + blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); /* * If the caller didn't supply a lock, fall back to our embedded @@ -210,37 +211,32 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_queue_max_sectors - set max sectors for a request for this queue + * blk_queue_max_hw_sectors - set max sectors for a request for this queue * @q: the request queue for the device - * @max_sectors: max sectors in the usual 512b unit + * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: - * Enables a low level driver to set an upper limit on the size of - * received requests. + * Enables a low level driver to set a hard upper limit, + * max_hw_sectors, on the size of requests. max_hw_sectors is set by + * the device driver based upon the combined capabilities of I/O + * controller and storage device. + * + * max_sectors is a soft limit imposed by the block layer for + * filesystem type requests. This value can be overridden on a + * per-device basis in /sys/block/<device>/queue/max_sectors_kb. + * The soft limit can not exceed max_hw_sectors. **/ -void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) { - if ((max_sectors << 9) < PAGE_CACHE_SIZE) { - max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); + if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { + max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_sectors); + __func__, max_hw_sectors); } - if (BLK_DEF_MAX_SECTORS > max_sectors) - q->limits.max_hw_sectors = q->limits.max_sectors = max_sectors; - else { - q->limits.max_sectors = BLK_DEF_MAX_SECTORS; - q->limits.max_hw_sectors = max_sectors; - } -} -EXPORT_SYMBOL(blk_queue_max_sectors); - -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors) -{ - if (BLK_DEF_MAX_SECTORS > max_sectors) - q->limits.max_hw_sectors = BLK_DEF_MAX_SECTORS; - else - q->limits.max_hw_sectors = max_sectors; + q->limits.max_hw_sectors = max_hw_sectors; + q->limits.max_sectors = min_t(unsigned int, max_hw_sectors, + BLK_DEF_MAX_SECTORS); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -257,17 +253,15 @@ void blk_queue_max_discard_sectors(struct request_queue *q, EXPORT_SYMBOL(blk_queue_max_discard_sectors); /** - * blk_queue_max_phys_segments - set max phys segments for a request for this queue + * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device * @max_segments: max number of segments * * Description: * Enables a low level driver to set an upper limit on the number of - * physical data segments in a request. This would be the largest sized - * scatter list the driver could handle. + * hw data segments in a request. **/ -void blk_queue_max_phys_segments(struct request_queue *q, - unsigned short max_segments) +void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) { if (!max_segments) { max_segments = 1; @@ -275,33 +269,9 @@ void blk_queue_max_phys_segments(struct request_queue *q, __func__, max_segments); } - q->limits.max_phys_segments = max_segments; + q->limits.max_segments = max_segments; } -EXPORT_SYMBOL(blk_queue_max_phys_segments); - -/** - * blk_queue_max_hw_segments - set max hw segments for a request for this queue - * @q: the request queue for the device - * @max_segments: max number of segments - * - * Description: - * Enables a low level driver to set an upper limit on the number of - * hw data segments in a request. This would be the largest number of - * address/length pairs the host adapter can actually give at once - * to the device. - **/ -void blk_queue_max_hw_segments(struct request_queue *q, - unsigned short max_segments) -{ - if (!max_segments) { - max_segments = 1; - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_segments); - } - - q->limits.max_hw_segments = max_segments; -} -EXPORT_SYMBOL(blk_queue_max_hw_segments); +EXPORT_SYMBOL(blk_queue_max_segments); /** * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg @@ -493,21 +463,11 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) } EXPORT_SYMBOL(blk_queue_stack_limits); -static unsigned int lcm(unsigned int a, unsigned int b) -{ - if (a && b) - return (a * b) / gcd(a, b); - else if (b) - return b; - - return a; -} - /** * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) * @b: the underlying queue limits (bottom, component device) - * @offset: offset to beginning of data within component device + * @start: first data sector within component device * * Description: * This function is used by stacking drivers like MD and DM to ensure @@ -525,10 +485,9 @@ static unsigned int lcm(unsigned int a, unsigned int b) * the alignment_offset is undefined. */ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, - sector_t offset) + sector_t start) { - sector_t alignment; - unsigned int top, bottom, ret = 0; + unsigned int top, bottom, alignment, ret = 0; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -537,18 +496,14 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); - t->max_phys_segments = min_not_zero(t->max_phys_segments, - b->max_phys_segments); - - t->max_hw_segments = min_not_zero(t->max_hw_segments, - b->max_hw_segments); + t->max_segments = min_not_zero(t->max_segments, b->max_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); t->misaligned |= b->misaligned; - alignment = queue_limit_alignment_offset(b, offset); + alignment = queue_limit_alignment_offset(b, start); /* Bottom device has different alignment. Check that it is * compatible with the current top alignment. @@ -611,11 +566,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, /* Discard alignment and granularity */ if (b->discard_granularity) { - unsigned int granularity = b->discard_granularity; - offset &= granularity - 1; - - alignment = (granularity + b->discard_alignment - offset) - & (granularity - 1); + alignment = queue_limit_discard_alignment(b, start); if (t->discard_granularity != 0 && t->discard_alignment != alignment) { @@ -657,7 +608,7 @@ int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, start += get_start_sect(bdev); - return blk_stack_limits(t, &bq->limits, start << 9); + return blk_stack_limits(t, &bq->limits, start); } EXPORT_SYMBOL(bdev_stack_limits); @@ -668,9 +619,8 @@ EXPORT_SYMBOL(bdev_stack_limits); * @offset: offset to beginning of data within component device * * Description: - * Merges the limits for two queues. Returns 0 if alignment - * didn't change. Returns -1 if adding the bottom device caused - * misalignment. + * Merges the limits for a top level gendisk and a bottom level + * block_device. */ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset) @@ -678,9 +628,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, struct request_queue *t = disk->queue; struct request_queue *b = bdev_get_queue(bdev); - offset += get_start_sect(bdev) << 9; - - if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) { + if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; disk_name(disk, 0, top); @@ -752,22 +700,19 @@ EXPORT_SYMBOL(blk_queue_update_dma_pad); * does is adjust the queue so that the buf is always appended * silently to the scatterlist. * - * Note: This routine adjusts max_hw_segments to make room for - * appending the drain buffer. If you call - * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after - * calling this routine, you must set the limit to one fewer than your - * device can support otherwise there won't be room for the drain - * buffer. + * Note: This routine adjusts max_hw_segments to make room for appending + * the drain buffer. If you call blk_queue_max_segments() after calling + * this routine, you must set the limit to one fewer than your device + * can support otherwise there won't be room for the drain buffer. */ int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, void *buf, unsigned int size) { - if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2) + if (queue_max_segments(q) < 2) return -EINVAL; /* make room for appending the drain */ - blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1); - blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1); + blk_queue_max_segments(q, queue_max_segments(q) - 1); q->dma_drain_needed = dma_drain_needed; q->dma_drain_buffer = buf; q->dma_drain_size = size; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8606c9543fdd..306759bbdf1b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -2,6 +2,7 @@ * Functions related to sysfs handling */ #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> @@ -106,6 +107,19 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) return queue_var_show(max_sectors_kb, (page)); } +static ssize_t queue_max_segments_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_segments(q), (page)); +} + +static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) +{ + if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) + return queue_var_show(queue_max_segment_size(q), (page)); + + return queue_var_show(PAGE_CACHE_SIZE, (page)); +} + static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) { return queue_var_show(queue_logical_block_size(q), page); @@ -189,7 +203,8 @@ static ssize_t queue_nonrot_store(struct request_queue *q, const char *page, static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { - return queue_var_show(blk_queue_nomerges(q), page); + return queue_var_show((blk_queue_nomerges(q) << 1) | + blk_queue_noxmerges(q), page); } static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, @@ -199,10 +214,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, ssize_t ret = queue_var_store(&nm, page, count); spin_lock_irq(q->queue_lock); - if (nm) + queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); + if (nm == 2) queue_flag_set(QUEUE_FLAG_NOMERGES, q); - else - queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + else if (nm) + queue_flag_set(QUEUE_FLAG_NOXMERGES, q); spin_unlock_irq(q->queue_lock); return ret; @@ -277,6 +294,16 @@ static struct queue_sysfs_entry queue_max_hw_sectors_entry = { .show = queue_max_hw_sectors_show, }; +static struct queue_sysfs_entry queue_max_segments_entry = { + .attr = {.name = "max_segments", .mode = S_IRUGO }, + .show = queue_max_segments_show, +}; + +static struct queue_sysfs_entry queue_max_segment_size_entry = { + .attr = {.name = "max_segment_size", .mode = S_IRUGO }, + .show = queue_max_segment_size_show, +}; + static struct queue_sysfs_entry queue_iosched_entry = { .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, .show = elv_iosched_show, @@ -352,6 +379,8 @@ static struct attribute *default_attrs[] = { &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, + &queue_max_segments_entry.attr, + &queue_max_segment_size_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, @@ -447,7 +476,7 @@ static void blk_release_queue(struct kobject *kobj) kmem_cache_free(blk_requestq_cachep, q); } -static struct sysfs_ops queue_sysfs_ops = { +static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, }; diff --git a/block/blk-tag.c b/block/blk-tag.c index 6b0f52c20964..ece65fc4c79b 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -5,6 +5,7 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/slab.h> #include "blk.h" diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 1ba7e0aca878..4f0c06c7a338 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -109,6 +109,7 @@ void blk_rq_timed_out_timer(unsigned long data) struct request_queue *q = (struct request_queue *) data; unsigned long flags, next = 0; struct request *rq, *tmp; + int next_set = 0; spin_lock_irqsave(q->queue_lock, flags); @@ -122,16 +123,13 @@ void blk_rq_timed_out_timer(unsigned long data) if (blk_mark_rq_complete(rq)) continue; blk_rq_timed_out(rq); - } else if (!next || time_after(next, rq->deadline)) + } else if (!next_set || time_after(next, rq->deadline)) { next = rq->deadline; + next_set = 1; + } } - /* - * next can never be 0 here with the list non-empty, since we always - * bump ->deadline to 1 so we can detect if the timer was ever added - * or not. See comment in blk_add_timer() - */ - if (next) + if (next_set) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); diff --git a/block/bsg.c b/block/bsg.c index a9fd2d84b53a..82d58829ba59 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -21,6 +21,7 @@ #include <linux/idr.h> #include <linux/bsg.h> #include <linux/smp_lock.h> +#include <linux/slab.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> @@ -260,7 +261,7 @@ bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm, return ERR_PTR(ret); /* - * map scatter-gather elements seperately and string them to request + * map scatter-gather elements separately and string them to request */ rq = blk_get_request(q, rw, GFP_KERNEL); if (!rq) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ee130f14d1fc..7982b830db58 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -7,19 +7,20 @@ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> */ #include <linux/module.h> +#include <linux/slab.h> #include <linux/blkdev.h> #include <linux/elevator.h> #include <linux/jiffies.h> #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/blktrace_api.h> -#include "blk-cgroup.h" +#include "cfq.h" /* * tunables */ /* max queue in one round of service */ -static const int cfq_quantum = 4; +static const int cfq_quantum = 8; static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; /* maximum backwards seek, in KiB */ static const int cfq_back_max = 16 * 1024; @@ -42,19 +43,19 @@ static const int cfq_hist_divisor = 4; */ #define CFQ_MIN_TT (2) -/* - * Allow merged cfqqs to perform this amount of seeky I/O before - * deciding to break the queues up again. - */ -#define CFQQ_COOP_TOUT (HZ) - #define CFQ_SLICE_SCALE (5) #define CFQ_HW_QUEUE_MIN (5) #define CFQ_SERVICE_SHIFT 12 +#define CFQQ_SEEK_THR (sector_t)(8 * 100) +#define CFQQ_CLOSE_THR (sector_t)(8 * 1024) +#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) +#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) + #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) +#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; @@ -63,6 +64,9 @@ static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); +static DEFINE_SPINLOCK(cic_index_lock); +static DEFINE_IDA(cic_index_ida); + #define CFQ_PRIO_LISTS IOPRIO_BE_NR #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) @@ -80,11 +84,12 @@ struct cfq_rb_root { struct rb_root rb; struct rb_node *left; unsigned count; + unsigned total_weight; u64 min_vdisktime; struct rb_node *active; - unsigned total_weight; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ + .count = 0, .min_vdisktime = 0, } /* * Per process-grouping structure @@ -118,11 +123,11 @@ struct cfq_queue { /* time when queue got scheduled in to dispatch first request. */ unsigned long dispatch_start; unsigned int allocated_slice; + unsigned int slice_dispatch; /* time when first request from queue completed and slice started. */ unsigned long slice_start; unsigned long slice_end; long slice_resid; - unsigned int slice_dispatch; /* pending metadata requests */ int meta_pending; @@ -133,20 +138,15 @@ struct cfq_queue { unsigned short ioprio, org_ioprio; unsigned short ioprio_class, org_ioprio_class; - unsigned int seek_samples; - u64 seek_total; - sector_t seek_mean; - sector_t last_request_pos; - unsigned long seeky_start; - pid_t pid; + u32 seek_history; + sector_t last_request_pos; + struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; struct cfq_group *cfqg; struct cfq_group *orig_cfqg; - /* Sectors dispatched in current dispatch round */ - unsigned long nr_sectors; }; /* @@ -227,8 +227,8 @@ struct cfq_data { unsigned int busy_queues; - int rq_in_driver[2]; - int sync_flight; + int rq_in_driver; + int rq_in_flight[2]; /* * queue-depth detection @@ -274,6 +274,7 @@ struct cfq_data { unsigned int cfq_latency; unsigned int cfq_group_isolation; + unsigned int cic_index; struct list_head cic_list; /* @@ -314,6 +315,7 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ CFQ_CFQQ_FLAG_sync, /* synchronous queue */ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ + CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ }; @@ -342,11 +344,12 @@ CFQ_CFQQ_FNS(prio_changed); CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); +CFQ_CFQQ_FNS(split_coop); CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS -#ifdef CONFIG_DEBUG_CFQ_IOSCHED +#ifdef CONFIG_CFQ_GROUP_IOSCHED #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ @@ -419,11 +422,6 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, struct io_context *); -static inline int rq_in_driver(struct cfq_data *cfqd) -{ - return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1]; -} - static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, bool is_sync) { @@ -436,6 +434,24 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic, cic->cfqq[is_sync] = cfqq; } +#define CIC_DEAD_KEY 1ul +#define CIC_DEAD_INDEX_SHIFT 1 + +static inline void *cfqd_dead_key(struct cfq_data *cfqd) +{ + return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); +} + +static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) +{ + struct cfq_data *cfqd = cic->key; + + if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY)) + return NULL; + + return cfqd; +} + /* * We regard a request as SYNC, if it's either a read or has the SYNC bit * set (in which case it could also be direct WRITE). @@ -863,7 +879,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); cfqg->saved_workload_slice = 0; - blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); + cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); } static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) @@ -889,8 +905,7 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) slice_used = cfqq->allocated_slice; } - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, - cfqq->nr_sectors); + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used); return slice_used; } @@ -924,8 +939,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); - blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, - cfqq->nr_sectors); + cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); + cfq_blkiocg_set_start_empty_time(&cfqg->blkg); } #ifdef CONFIG_CFQ_GROUP_IOSCHED @@ -953,11 +968,12 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; unsigned int major, minor; - /* Do we need to take this reference */ - if (!blkiocg_css_tryget(blkcg)) - return NULL;; - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfqg->blkg.dev = MKDEV(major, minor); + goto done; + } if (cfqg || !create) goto done; @@ -965,7 +981,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) if (!cfqg) goto done; - cfqg->weight = blkcg->weight; for_each_cfqg_st(cfqg, i, j, st) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); @@ -980,14 +995,14 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) /* Add group onto cgroup list */ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); - blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, MKDEV(major, minor)); + cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); /* Add group on cfqd list */ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); done: - blkiocg_css_put(blkcg); return cfqg; } @@ -1009,6 +1024,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) return cfqg; } +static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) +{ + atomic_inc(&cfqg->ref); + return cfqg; +} + static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { /* Currently, all async queues are mapped to root group */ @@ -1058,7 +1079,7 @@ static void cfq_release_cfq_groups(struct cfq_data *cfqd) * it from cgroup list, then it will take care of destroying * cfqg also. */ - if (!blkiocg_del_blkio_group(&cfqg->blkg)) + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) cfq_destroy_cfqg(cfqd, cfqg); } } @@ -1092,6 +1113,12 @@ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) { return &cfqd->root_group; } + +static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) +{ + return cfqg; +} + static inline void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { cfqq->cfqg = cfqg; @@ -1394,7 +1421,12 @@ static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; + cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(rq), rq_is_sync(rq)); cfq_add_rq_rb(rq); + cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, + &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), + rq_is_sync(rq)); } static struct request * @@ -1422,9 +1454,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - cfqd->rq_in_driver[rq_is_sync(rq)]++; + cfqd->rq_in_driver++; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", - rq_in_driver(cfqd)); + cfqd->rq_in_driver); cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); } @@ -1432,12 +1464,11 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) static void cfq_deactivate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - const int sync = rq_is_sync(rq); - WARN_ON(!cfqd->rq_in_driver[sync]); - cfqd->rq_in_driver[sync]--; + WARN_ON(!cfqd->rq_in_driver); + cfqd->rq_in_driver--; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", - rq_in_driver(cfqd)); + cfqd->rq_in_driver); } static void cfq_remove_request(struct request *rq) @@ -1451,6 +1482,8 @@ static void cfq_remove_request(struct request *rq) cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; + cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(rq), rq_is_sync(rq)); if (rq_is_meta(rq)) { WARN_ON(!cfqq->meta_pending); cfqq->meta_pending--; @@ -1482,6 +1515,13 @@ static void cfq_merged_request(struct request_queue *q, struct request *req, } } +static void cfq_bio_merged(struct request_queue *q, struct request *req, + struct bio *bio) +{ + cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, + bio_data_dir(bio), cfq_bio_sync(bio)); +} + static void cfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) @@ -1499,6 +1539,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, if (cfqq->next_rq == next) cfqq->next_rq = rq; cfq_remove_request(next); + cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(next), rq_is_sync(next)); } static int cfq_allow_merge(struct request_queue *q, struct request *rq, @@ -1526,17 +1568,24 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, return cfqq == RQ_CFQQ(rq); } +static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + del_timer(&cfqd->idle_slice_timer); + cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); +} + static void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "set_active"); + cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", + cfqd->serving_prio, cfqd->serving_type); + cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); cfqq->slice_start = 0; cfqq->dispatch_start = jiffies; cfqq->allocated_slice = 0; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; - cfqq->nr_sectors = 0; cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_must_dispatch(cfqq); @@ -1544,7 +1593,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, cfq_clear_cfqq_fifo_expire(cfqq); cfq_mark_cfqq_slice_new(cfqq); - del_timer(&cfqd->idle_slice_timer); + cfq_del_timer(cfqd, cfqq); } cfqd->active_queue = cfqq; @@ -1560,12 +1609,21 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); if (cfq_cfqq_wait_request(cfqq)) - del_timer(&cfqd->idle_slice_timer); + cfq_del_timer(cfqd, cfqq); cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_wait_busy(cfqq); /* + * If this cfqq is shared between multiple processes, check to + * make sure that those processes are still issuing I/Os within + * the mean seek distance. If not, it may be time to break the + * queues apart again. + */ + if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) + cfq_mark_cfqq_split_coop(cfqq); + + /* * store what was left of this slice, if the queue idled/timed out */ if (timed_out && !cfq_cfqq_slice_new(cfqq)) { @@ -1663,22 +1721,10 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, return cfqd->last_position - blk_rq_pos(rq); } -#define CFQQ_SEEK_THR 8 * 1024 -#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) - static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *rq, bool for_preempt) + struct request *rq) { - sector_t sdist = cfqq->seek_mean; - - if (!sample_valid(cfqq->seek_samples)) - sdist = CFQQ_SEEK_THR; - - /* if seek_mean is big, using it as close criteria is meaningless */ - if (sdist > CFQQ_SEEK_THR && !for_preempt) - sdist = CFQQ_SEEK_THR; - - return cfq_dist_from_last(cfqd, rq) <= sdist; + return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; } static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, @@ -1705,7 +1751,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, * will contain the closest sector. */ __cfqq = rb_entry(parent, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; if (blk_rq_pos(__cfqq->next_rq) < sector) @@ -1716,7 +1762,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, return NULL; __cfqq = rb_entry(node, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq, false)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; return NULL; @@ -1737,6 +1783,8 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, { struct cfq_queue *cfqq; + if (cfq_class_idle(cur_cfqq)) + return NULL; if (!cfq_cfqq_sync(cur_cfqq)) return NULL; if (CFQQ_SEEKY(cur_cfqq)) @@ -1803,7 +1851,11 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - return service_tree->count == 1; + if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) + return 1; + cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", + service_tree->count); + return 0; } static void cfq_arm_slice_timer(struct cfq_data *cfqd) @@ -1848,14 +1900,18 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * time slice. */ if (sample_valid(cic->ttime_samples) && - (cfqq->slice_end - jiffies < cic->ttime_mean)) + (cfqq->slice_end - jiffies < cic->ttime_mean)) { + cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", + cic->ttime_mean); return; + } cfq_mark_cfqq_wait_request(cfqq); sl = cfqd->cfq_slice_idle; mod_timer(&cfqd->idle_slice_timer, jiffies + sl); + cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); } @@ -1874,9 +1930,9 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqq->dispatched++; elv_dispatch_sort(q, rq); - if (cfq_cfqq_sync(cfqq)) - cfqd->sync_flight++; - cfqq->nr_sectors += blk_rq_sectors(rq); + cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; + cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), + rq_data_dir(rq), rq_is_sync(rq)); } /* @@ -1930,6 +1986,15 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) int process_refs, new_process_refs; struct cfq_queue *__cfqq; + /* + * If there are no process references on the new_cfqq, then it is + * unsafe to follow the ->new_cfqq chain as other cfqq's in the + * chain may have dropped their last reference (not just their + * last process reference). + */ + if (!cfqq_process_refs(new_cfqq)) + return; + /* Avoid a circular list and skip interim queue merges */ while ((__cfqq = new_cfqq->new_cfqq)) { if (__cfqq == cfqq) @@ -1938,17 +2003,17 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) } process_refs = cfqq_process_refs(cfqq); + new_process_refs = cfqq_process_refs(new_cfqq); /* * If the process for the cfqq has gone away, there is no * sense in merging the queues. */ - if (process_refs == 0) + if (process_refs == 0 || new_process_refs == 0) return; /* * Merge in the direction of the lesser amount of work. */ - new_process_refs = cfqq_process_refs(new_cfqq); if (new_process_refs >= process_refs) { cfqq->new_cfqq = new_cfqq; atomic_add(process_refs, &new_cfqq->ref); @@ -2058,6 +2123,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) slice = max(slice, 2 * cfqd->cfq_slice_idle); slice = max_t(unsigned, slice, CFQ_MIN_TT); + cfq_log(cfqd, "workload slice:%d", slice); cfqd->workload_expires = jiffies + slice; cfqd->noidle_tree_requires_idle = false; } @@ -2205,16 +2271,32 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) struct cfq_queue *cfqq; int dispatched = 0; - while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) + /* Expire the timeslice of the current active queue first */ + cfq_slice_expired(cfqd, 0); + while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { + __cfq_set_active_queue(cfqd, cfqq); dispatched += __cfq_forced_dispatch_cfqq(cfqq); + } - cfq_slice_expired(cfqd, 0); BUG_ON(cfqd->busy_queues); cfq_log(cfqd, "forced_dispatch=%d", dispatched); return dispatched; } +static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, + struct cfq_queue *cfqq) +{ + /* the queue hasn't finished any request, can't estimate */ + if (cfq_cfqq_slice_new(cfqq)) + return 1; + if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, + cfqq->slice_end)) + return 1; + + return 0; +} + static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned int max_dispatch; @@ -2222,16 +2304,16 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) /* * Drain async requests before we start sync IO */ - if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) + if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) return false; /* * If this is an async queue and we have sync IO in flight, let it wait */ - if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) + if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) return false; - max_dispatch = cfqd->cfq_quantum; + max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); if (cfq_class_idle(cfqq)) max_dispatch = 1; @@ -2248,13 +2330,22 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) /* * We have other queues, don't allow more IO from this one */ - if (cfqd->busy_queues > 1) + if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) return false; /* * Sole queue user, no limit */ - max_dispatch = -1; + if (cfqd->busy_queues == 1) + max_dispatch = -1; + else + /* + * Normally we start throttling cfqq when cfq_quantum/2 + * requests have been dispatched. But we can drive + * deeper queue depths at the beginning of slice + * subjected to upper limit of cfq_quantum. + * */ + max_dispatch = cfqd->cfq_quantum; } /* @@ -2450,11 +2541,12 @@ static void cfq_cic_free(struct cfq_io_context *cic) static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) { unsigned long flags; + unsigned long dead_key = (unsigned long) cic->key; - BUG_ON(!cic->dead_key); + BUG_ON(!(dead_key & CIC_DEAD_KEY)); spin_lock_irqsave(&ioc->lock, flags); - radix_tree_delete(&ioc->radix_root, cic->dead_key); + radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT); hlist_del_rcu(&cic->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -2477,15 +2569,10 @@ static void cfq_free_io_context(struct io_context *ioc) __call_for_each_cic(ioc, cic_free_func); } -static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static void cfq_put_cooperator(struct cfq_queue *cfqq) { struct cfq_queue *__cfqq, *next; - if (unlikely(cfqq == cfqd->active_queue)) { - __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); - } - /* * If this queue was scheduled to merge with another queue, be * sure to drop the reference taken on that queue (and others in @@ -2501,6 +2588,16 @@ static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfq_put_queue(__cfqq); __cfqq = next; } +} + +static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + if (unlikely(cfqq == cfqd->active_queue)) { + __cfq_slice_expired(cfqd, cfqq, 0); + cfq_schedule_dispatch(cfqd); + } + + cfq_put_cooperator(cfqq); cfq_put_queue(cfqq); } @@ -2513,11 +2610,10 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, list_del_init(&cic->queue_list); /* - * Make sure key == NULL is seen for dead queues + * Make sure dead mark is seen for dead queues */ smp_wmb(); - cic->dead_key = (unsigned long) cic->key; - cic->key = NULL; + cic->key = cfqd_dead_key(cfqd); if (ioc->ioc_data == cic) rcu_assign_pointer(ioc->ioc_data, NULL); @@ -2536,7 +2632,7 @@ static void __cfq_exit_single_io_context(struct cfq_data *cfqd, static void cfq_exit_single_io_context(struct io_context *ioc, struct cfq_io_context *cic) { - struct cfq_data *cfqd = cic->key; + struct cfq_data *cfqd = cic_to_cfqd(cic); if (cfqd) { struct request_queue *q = cfqd->queue; @@ -2549,7 +2645,7 @@ static void cfq_exit_single_io_context(struct io_context *ioc, * race between exiting task and queue */ smp_read_barrier_depends(); - if (cic->key) + if (cic->key == cfqd) __cfq_exit_single_io_context(cfqd, cic); spin_unlock_irqrestore(q->queue_lock, flags); @@ -2629,7 +2725,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) { - struct cfq_data *cfqd = cic->key; + struct cfq_data *cfqd = cic_to_cfqd(cic); struct cfq_queue *cfqq; unsigned long flags; @@ -2686,7 +2782,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) { struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); - struct cfq_data *cfqd = cic->key; + struct cfq_data *cfqd = cic_to_cfqd(cic); unsigned long flags; struct request_queue *q; @@ -2823,12 +2919,13 @@ cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, unsigned long flags; WARN_ON(!list_empty(&cic->queue_list)); + BUG_ON(cic->key != cfqd_dead_key(cfqd)); spin_lock_irqsave(&ioc->lock, flags); BUG_ON(ioc->ioc_data == cic); - radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd); + radix_tree_delete(&ioc->radix_root, cfqd->cic_index); hlist_del_rcu(&cic->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -2840,7 +2937,6 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) { struct cfq_io_context *cic; unsigned long flags; - void *k; if (unlikely(!ioc)) return NULL; @@ -2857,13 +2953,11 @@ cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) } do { - cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); + cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index); rcu_read_unlock(); if (!cic) break; - /* ->key must be copied to avoid race with cfq_exit_queue() */ - k = cic->key; - if (unlikely(!k)) { + if (unlikely(cic->key != cfqd)) { cfq_drop_dead_cic(cfqd, ioc, cic); rcu_read_lock(); continue; @@ -2896,7 +2990,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, spin_lock_irqsave(&ioc->lock, flags); ret = radix_tree_insert(&ioc->radix_root, - (unsigned long) cfqd, cic); + cfqd->cic_index, cic); if (!ret) hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); spin_unlock_irqrestore(&ioc->lock, flags); @@ -2976,43 +3070,20 @@ static void cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct request *rq) { - sector_t sdist; - u64 total; - - if (!cfqq->last_request_pos) - sdist = 0; - else if (cfqq->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - cfqq->last_request_pos; - else - sdist = cfqq->last_request_pos - blk_rq_pos(rq); + sector_t sdist = 0; + sector_t n_sec = blk_rq_sectors(rq); + if (cfqq->last_request_pos) { + if (cfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - cfqq->last_request_pos; + else + sdist = cfqq->last_request_pos - blk_rq_pos(rq); + } - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc - */ - if (cfqq->seek_samples <= 60) /* second&third seek */ - sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024); + cfqq->seek_history <<= 1; + if (blk_queue_nonrot(cfqd->queue)) + cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); else - sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64); - - cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8; - cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8; - total = cfqq->seek_total + (cfqq->seek_samples/2); - do_div(total, cfqq->seek_samples); - cfqq->seek_mean = (sector_t)total; - - /* - * If this cfqq is shared between multiple processes, check to - * make sure that those processes are still issuing I/Os within - * the mean seek distance. If not, it may be time to break the - * queues apart again. - */ - if (cfq_cfqq_coop(cfqq)) { - if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start) - cfqq->seeky_start = jiffies; - else if (!CFQQ_SEEKY(cfqq)) - cfqq->seeky_start = 0; - } + cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); } /* @@ -3037,8 +3108,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_mark_cfqq_deep(cfqq); if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || - (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples) - && CFQQ_SEEKY(cfqq))) + (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { if (cic->ttime_mean > cfqd->cfq_slice_idle) @@ -3122,7 +3192,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, cfqq, rq, true)) + if (cfq_rq_close(cfqd, cfqq, rq)) return true; return false; @@ -3183,11 +3253,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfq_cfqq_wait_request(cfqq)) { if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || cfqd->busy_queues > 1) { - del_timer(&cfqd->idle_slice_timer); + cfq_del_timer(cfqd, cfqq); cfq_clear_cfqq_wait_request(cfqq); __blk_run_queue(cfqd->queue); - } else + } else { + cfq_blkiocg_update_idle_time_stats( + &cfqq->cfqg->blkg); cfq_mark_cfqq_must_dispatch(cfqq); + } } } else if (cfq_should_preempt(cfqd, cfqq, rq)) { /* @@ -3212,7 +3285,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); list_add_tail(&rq->queuelist, &cfqq->fifo); cfq_add_rq_rb(rq); - + cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, + &cfqd->serving_group->blkg, rq_data_dir(rq), + rq_is_sync(rq)); cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -3224,14 +3299,14 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; - if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth) - cfqd->hw_tag_est_depth = rq_in_driver(cfqd); + if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) + cfqd->hw_tag_est_depth = cfqd->rq_in_driver; if (cfqd->hw_tag == 1) return; if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && - rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) + cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) return; /* @@ -3241,7 +3316,7 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) */ if (cfqq && cfq_cfqq_idle_window(cfqq) && cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < - CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN) + CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) return; if (cfqd->hw_tag_samples++ < 50) @@ -3294,13 +3369,15 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_update_hw_tag(cfqd); - WARN_ON(!cfqd->rq_in_driver[sync]); + WARN_ON(!cfqd->rq_in_driver); WARN_ON(!cfqq->dispatched); - cfqd->rq_in_driver[sync]--; + cfqd->rq_in_driver--; cfqq->dispatched--; + cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, + rq_start_time_ns(rq), rq_io_start_time_ns(rq), + rq_data_dir(rq), rq_is_sync(rq)); - if (cfq_cfqq_sync(cfqq)) - cfqd->sync_flight--; + cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; if (sync) { RQ_CIC(rq)->last_end_request = now; @@ -3327,6 +3404,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) if (cfq_should_wait_busy(cfqd, cfqq)) { cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; cfq_mark_cfqq_wait_busy(cfqq); + cfq_log_cfqq(cfqd, cfqq, "will busy wait"); } /* @@ -3354,7 +3432,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) } } - if (!rq_in_driver(cfqd)) + if (!cfqd->rq_in_driver) cfq_schedule_dispatch(cfqd); } @@ -3438,6 +3516,10 @@ static void cfq_put_request(struct request *rq) rq->elevator_private = NULL; rq->elevator_private2 = NULL; + /* Put down rq reference on cfqg */ + cfq_put_cfqg(RQ_CFQG(rq)); + rq->elevator_private3 = NULL; + cfq_put_queue(cfqq); } } @@ -3453,14 +3535,6 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, return cic_to_cfqq(cic, 1); } -static int should_split_cfqq(struct cfq_queue *cfqq) -{ - if (cfqq->seeky_start && - time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT)) - return 1; - return 0; -} - /* * Returns NULL if a new cfqq should be allocated, or the old cfqq if this * was the last process referring to said cfqq. @@ -3469,13 +3543,16 @@ static struct cfq_queue * split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) { if (cfqq_process_refs(cfqq) == 1) { - cfqq->seeky_start = 0; cfqq->pid = current->pid; cfq_clear_cfqq_coop(cfqq); + cfq_clear_cfqq_split_coop(cfqq); return cfqq; } cic_set_cfqq(cic, NULL, 1); + + cfq_put_cooperator(cfqq); + cfq_put_queue(cfqq); return NULL; } @@ -3510,7 +3587,7 @@ new_queue: /* * If the queue was seeky for too long, break it apart. */ - if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) { + if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); cfqq = split_cfqq(cic, cfqq); if (!cfqq) @@ -3534,6 +3611,7 @@ new_queue: rq->elevator_private = cic; rq->elevator_private2 = cfqq; + rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); return 0; queue_fail: @@ -3661,16 +3739,38 @@ static void cfq_exit_queue(struct elevator_queue *e) cfq_put_async_queues(cfqd); cfq_release_cfq_groups(cfqd); - blkiocg_del_blkio_group(&cfqd->root_group.blkg); + cfq_blkiocg_del_blkio_group(&cfqd->root_group.blkg); spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); + spin_lock(&cic_index_lock); + ida_remove(&cic_index_ida, cfqd->cic_index); + spin_unlock(&cic_index_lock); + /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ call_rcu(&cfqd->rcu, cfq_cfqd_free); } +static int cfq_alloc_cic_index(void) +{ + int index, error; + + do { + if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&cic_index_lock); + error = ida_get_new(&cic_index_ida, &index); + spin_unlock(&cic_index_lock); + if (error && error != -EAGAIN) + return error; + } while (error); + + return index; +} + static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; @@ -3678,10 +3778,16 @@ static void *cfq_init_queue(struct request_queue *q) struct cfq_group *cfqg; struct cfq_rb_root *st; + i = cfq_alloc_cic_index(); + if (i < 0) + return NULL; + cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) return NULL; + cfqd->cic_index = i; + /* Init root service tree */ cfqd->grp_service_tree = CFQ_RB_ROOT; @@ -3700,8 +3806,10 @@ static void *cfq_init_queue(struct request_queue *q) * to make sure that cfq_put_cfqg() does not try to kfree root group */ atomic_set(&cfqg->ref, 1); - blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, - 0); + rcu_read_lock(); + cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, + (void *)cfqd, 0); + rcu_read_unlock(); #endif /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -3747,7 +3855,6 @@ static void *cfq_init_queue(struct request_queue *q) * second, in order to have larger depth for async operations. */ cfqd->last_delayed_sync = jiffies - HZ; - INIT_RCU_HEAD(&cfqd->rcu); return cfqd; } @@ -3876,6 +3983,7 @@ static struct elevator_type iosched_cfq = { .elevator_merged_fn = cfq_merged_request, .elevator_merge_req_fn = cfq_merged_requests, .elevator_allow_merge_fn = cfq_allow_merge, + .elevator_bio_merged_fn = cfq_bio_merged, .elevator_dispatch_fn = cfq_dispatch_requests, .elevator_add_req_fn = cfq_insert_request, .elevator_activate_req_fn = cfq_activate_request, @@ -3941,6 +4049,7 @@ static void __exit cfq_exit(void) */ if (elv_ioc_count_read(cfq_ioc_count)) wait_for_completion(&all_gone); + ida_destroy(&cic_index_ida); cfq_slab_kill(); } diff --git a/block/cfq.h b/block/cfq.h new file mode 100644 index 000000000000..93448e5a2e41 --- /dev/null +++ b/block/cfq.h @@ -0,0 +1,115 @@ +#ifndef _CFQ_H +#define _CFQ_H +#include "blk-cgroup.h" + +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) +{ + blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkiocg_update_dequeue_stats(blkg, dequeue); +} + +static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time) +{ + blkiocg_update_timeslice_used(blkg, time); +} + +static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) +{ + blkiocg_set_start_empty_time(blkg); +} + +static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + blkiocg_update_io_remove_stats(blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + blkiocg_update_io_merged_stats(blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ + blkiocg_update_idle_time_stats(blkg); +} + +static inline void +cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +{ + blkiocg_update_avg_queue_size_stats(blkg); +} + +static inline void +cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{ + blkiocg_update_set_idle_time_stats(blkg); +} + +static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) +{ + blkiocg_update_dispatch_stats(blkg, bytes, direction, sync); +} + +static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) +{ + blkiocg_update_completion_stats(blkg, start_time, io_start_time, + direction, sync); +} + +static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) { + blkiocg_add_blkio_group(blkcg, blkg, key, dev); +} + +static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + return blkiocg_del_blkio_group(blkg); +} + +#else /* CFQ_GROUP_IOSCHED */ +static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) {} + +static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) {} + +static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time) {} +static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} +static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ +} +static inline void +cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {} + +static inline void +cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {} + +static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) {} +static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {} + +static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) {} +static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + return 0; +} + +#endif /* CFQ_GROUP_IOSCHED */ +#endif diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 4eb8e9ea4af5..f26051f44681 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -6,6 +6,7 @@ #include <linux/elevator.h> #include <linux/fd.h> #include <linux/hdreg.h> +#include <linux/slab.h> #include <linux/syscalls.h> #include <linux/smp_lock.h> #include <linux/types.h> diff --git a/block/elevator.c b/block/elevator.c index 9ad5ccc4c5ee..923a9139106c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -154,7 +154,7 @@ static struct elevator_type *elevator_get(const char *name) spin_unlock(&elv_list_lock); - sprintf(elv, "%s-iosched", name); + snprintf(elv, sizeof(elv), "%s-iosched", name); request_module("%s", elv); spin_lock(&elv_list_lock); @@ -242,9 +242,11 @@ int elevator_init(struct request_queue *q, char *name) { struct elevator_type *e = NULL; struct elevator_queue *eq; - int ret = 0; void *data; + if (unlikely(q->elevator)) + return 0; + INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; q->end_sector = 0; @@ -284,7 +286,7 @@ int elevator_init(struct request_queue *q, char *name) } elevator_attach(q, eq, data); - return ret; + return 0; } EXPORT_SYMBOL(elevator_init); @@ -474,6 +476,15 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) int ret; /* + * Levels of merges: + * nomerges: No merges at all attempted + * noxmerges: Only simple one-hit cache try + * merges: All merge tries attempted + */ + if (blk_queue_nomerges(q)) + return ELEVATOR_NO_MERGE; + + /* * First try one-hit cache. */ if (q->last_merge) { @@ -484,7 +495,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) } } - if (blk_queue_nomerges(q)) + if (blk_queue_noxmerges(q)) return ELEVATOR_NO_MERGE; /* @@ -530,6 +541,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, q->last_merge = rq; } +void elv_bio_merged(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e->ops->elevator_bio_merged_fn) + e->ops->elevator_bio_merged_fn(q, rq, bio); +} + void elv_requeue_request(struct request_queue *q, struct request *rq) { /* @@ -883,7 +903,7 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr, return error; } -static struct sysfs_ops elv_sysfs_ops = { +static const struct sysfs_ops elv_sysfs_ops = { .show = elv_attr_show, .store = elv_attr_store, }; @@ -912,6 +932,7 @@ int elv_register_queue(struct request_queue *q) } return error; } +EXPORT_SYMBOL(elv_register_queue); static void __elv_unregister_queue(struct elevator_queue *e) { @@ -924,6 +945,7 @@ void elv_unregister_queue(struct request_queue *q) if (q) __elv_unregister_queue(q->elevator); } +EXPORT_SYMBOL(elv_unregister_queue); void elv_register(struct elevator_type *e) { @@ -1077,7 +1099,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_type *__e; int len = 0; - if (!q->elevator) + if (!q->elevator || !blk_queue_stackable(q)) return sprintf(name, "none\n"); elv = e->elevator_type; diff --git a/block/genhd.c b/block/genhd.c index d13ba76a169c..59a2db6fecef 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -596,6 +596,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) return disk; } +EXPORT_SYMBOL(get_gendisk); /** * bdget_disk - do bdget() by gendisk and partition number @@ -987,7 +988,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) if (!new_ptbl) return -ENOMEM; - INIT_RCU_HEAD(&new_ptbl->rcu_head); new_ptbl->len = target; for (i = 0; i < len; i++) diff --git a/block/ioctl.c b/block/ioctl.c index be48ea51faee..e8eb679f2f9b 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -1,5 +1,6 @@ #include <linux/capability.h> #include <linux/blkdev.h> +#include <linux/gfp.h> #include <linux/blkpg.h> #include <linux/hdreg.h> #include <linux/backing-dev.h> @@ -125,7 +126,7 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, if (start + len > (bdev->bd_inode->i_size >> 9)) return -EINVAL; return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, - DISCARD_FL_WAIT); + BLKDEV_IFL_WAIT); } static int put_ushort(unsigned long arg, unsigned short val) diff --git a/block/noop-iosched.c b/block/noop-iosched.c index 3a0d369d08c7..232c4b38cd37 100644 --- a/block/noop-iosched.c +++ b/block/noop-iosched.c @@ -5,6 +5,7 @@ #include <linux/elevator.h> #include <linux/bio.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/init.h> struct noop_data { |