From 7d35079f8277b653d6a3075eea9edd4dbf7c2b29 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:46 -0700 Subject: kernfs: use idr instead of ida to manage inode number kernfs uses ida to manage inode number. The problem is we can't get kernfs_node from inode number with ida. Switching to use idr, next patch will add an API to get kernfs_node from inode number. Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/kernfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index a9b11b8d06f2..5f5d602eb433 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -163,7 +163,7 @@ struct kernfs_root { unsigned int flags; /* KERNFS_ROOT_* flags */ /* private fields, do not use outside kernfs proper */ - struct ida ino_ida; + struct idr ino_idr; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_mutex */ -- cgit v1.2.3 From 4a3ef68acacf31570066e69593de5cc49cc91638 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:47 -0700 Subject: kernfs: implement i_generation Set i_generation for kernfs inode. This is required to implement exportfs operations. The generation is 32-bit, so it's possible the generation wraps up and we find stale files. To reduce the posssibility, we don't reuse inode numer immediately. When the inode number allocation wraps, we increase generation number. In this way generation/inode number consist of a 64-bit number which is unlikely duplicated. This does make the idr tree more sparse and waste some memory. Since idr manages 32-bit keys, idr uses a 6-level radix tree, each level covers 6 bits of the key. In a 100k inode kernfs, the worst case will have around 300k radix tree node. Each node is 576bytes, so the tree will use about ~150M memory. Sounds not too bad, if this really is a problem, we should find better data structure. Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/kernfs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5f5d602eb433..8c00d28f468a 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -135,6 +135,7 @@ struct kernfs_node { umode_t mode; unsigned int ino; struct kernfs_iattrs *iattr; + u32 generation; }; /* @@ -164,6 +165,7 @@ struct kernfs_root { /* private fields, do not use outside kernfs proper */ struct idr ino_idr; + u32 next_generation; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_mutex */ -- cgit v1.2.3 From c53cd490b1a491ebf1d8e30da97e7231459a4208 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:50 -0700 Subject: kernfs: introduce kernfs_node_id inode number and generation can identify a kernfs node. We are going to export the identification by exportfs operations, so put ino and generation into a separate structure. It's convenient when later patches use the identification. Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/cgroup.h | 2 +- include/linux/kernfs.h | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 710a005c6b7a..30c68773fd1e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -543,7 +543,7 @@ static inline bool cgroup_is_populated(struct cgroup *cgrp) /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { - return cgrp->kn->ino; + return cgrp->kn->id.ino; } /* cft/css accessors for cftype->write() operation */ diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 8c00d28f468a..06a0c5913e1d 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -95,6 +95,15 @@ struct kernfs_elem_attr { struct kernfs_node *notify_next; /* for kernfs_notify() */ }; +/* represent a kernfs node */ +union kernfs_node_id { + struct { + u32 ino; + u32 generation; + }; + u64 id; +}; + /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are @@ -131,11 +140,10 @@ struct kernfs_node { void *priv; + union kernfs_node_id id; unsigned short flags; umode_t mode; - unsigned int ino; struct kernfs_iattrs *iattr; - u32 generation; }; /* -- cgit v1.2.3 From aa8188253474b4053bc2900d9fcb545ce68bdf5c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:51 -0700 Subject: kernfs: add exportfs operations Now we have the facilities to implement exportfs operations. The idea is cgroup can export the fhandle info to userspace, then userspace uses fhandle to find the cgroup name. Another example is userspace can get fhandle for a cgroup and BPF uses the fhandle to filter info for the cgroup. Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/kernfs.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 06a0c5913e1d..d149361e5875 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -69,6 +69,12 @@ enum kernfs_root_flag { * following flag enables that behavior. */ KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002, + + /* + * The filesystem supports exportfs operation, so userspace can use + * fhandle to access nodes of the fs. + */ + KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004, }; /* type-specific structures for kernfs_node union members */ @@ -98,6 +104,12 @@ struct kernfs_elem_attr { /* represent a kernfs node */ union kernfs_node_id { struct { + /* + * blktrace will export this struct as a simplified 'struct + * fid' (which is a big data struction), so userspace can use + * it to find kernfs node. The layout must match the first two + * fields of 'struct fid' exactly. + */ u32 ino; u32 generation; }; -- cgit v1.2.3 From 121508df44d074245a72eda6b067478218480a40 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:52 -0700 Subject: cgroup: export fhandle info for a cgroup Add an API to export cgroup fhandle info. We don't export a full 'struct file_handle', there are unrequired info. Sepcifically, cgroup is always a directory, so we don't need a 'FILEID_INO32_GEN_PARENT' type fhandle, we only need export the inode number and generation number just like what generic_fh_to_dentry does. And we can avoid the overhead of getting an inode too, since kernfs_node_id (ino and generation) has all the info required. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/cgroup.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 30c68773fd1e..52ef9a68ff14 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -609,6 +609,10 @@ static inline void cgroup_kthread_ready(void) current->no_cgroup_migration = 0; } +static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +{ + return &cgrp->kn->id; +} #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -631,6 +635,10 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} +static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) +{ + return NULL; +} static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) -- cgit v1.2.3 From 007cc56b7eeca8848021bc43aca2b8607fbe5589 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:54 -0700 Subject: block: always attach cgroup info into bio blkcg_bio_issue_check() already gets blkcg for a BIO. bio_associate_blkcg() uses a percpu refcounter, so it's a very cheap operation. There is no point we don't attach the cgroup info into bio at blkcg_bio_issue_check. This also makes blktrace outputs correct cgroup info. Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 7104bea8dab1..9d92153dd856 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -691,6 +691,9 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, rcu_read_lock(); blkcg = bio_blkcg(bio); + /* associate blkcg if bio hasn't attached one */ + bio_associate_blkcg(bio, &blkcg->css); + blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) { spin_lock_irq(q->queue_lock); -- cgit v1.2.3 From 69fd5c391763bd94a40dd152bc72a7f230137150 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:55 -0700 Subject: blktrace: add an option to allow displaying cgroup path By default we output cgroup id in blktrace. This adds an option to display cgroup path. Since get cgroup path is a relativly heavy operation, we don't enable it by default. with the option enabled, blktrace will output something like this: dd-1353 [007] d..2 293.015252: 8,0 /test/level D R 24 + 8 [dd] Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/cgroup.h | 6 ++++++ include/linux/kernfs.h | 2 ++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 52ef9a68ff14..6144fe923b73 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -613,6 +613,9 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp) { return &cgrp->kn->id; } + +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; @@ -645,6 +648,9 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, { return true; } + +static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ /* diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index d149361e5875..ab25c8b6d9e3 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -358,6 +358,8 @@ struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns); void kernfs_init(void); +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, + const union kernfs_node_id *id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) -- cgit v1.2.3 From 35fe6d763229e8fc0eb5f9b93a401673cfcb5e1e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:56 -0700 Subject: block: use standard blktrace API to output cgroup info for debug notes Currently cfq/bfq/blk-throttle output cgroup info in trace in their own way. Now we have standard blktrace API for this, so convert them to use it. Note, this changes the behavior a little bit. cgroup info isn't output by default, we only do this with 'blk_cgroup' option enabled. cgroup info isn't output as a string by default too, we only do this with 'blk_cgname' option enabled. Also cgroup info is output in different position of the note string. I think these behavior changes aren't a big issue (actually we make trace data shorter which is good), since the blktrace note is solely for debugging. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index d2e908586e3d..67b4d4dfc19c 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -28,10 +28,12 @@ struct blk_trace { atomic_t dropped; }; +struct blkcg; + extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern __printf(2, 3) -void __trace_note_message(struct blk_trace *, const char *fmt, ...); +extern __printf(3, 4) +void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *fmt, ...); /** * blk_add_trace_msg - Add a (simple) message to the blktrace stream @@ -46,12 +48,14 @@ void __trace_note_message(struct blk_trace *, const char *fmt, ...); * NOTE: Can not use 'static inline' due to presence of var args... * **/ -#define blk_add_trace_msg(q, fmt, ...) \ +#define blk_add_cgroup_trace_msg(q, cg, fmt, ...) \ do { \ struct blk_trace *bt = (q)->blk_trace; \ if (unlikely(bt)) \ - __trace_note_message(bt, fmt, ##__VA_ARGS__); \ + __trace_note_message(bt, cg, fmt, ##__VA_ARGS__);\ } while (0) +#define blk_add_trace_msg(q, fmt, ...) \ + blk_add_cgroup_trace_msg(q, NULL, fmt, ##__VA_ARGS__) #define BLK_TN_MAX_MSG 128 static inline bool blk_trace_note_message_enabled(struct request_queue *q) @@ -82,6 +86,7 @@ extern struct attribute_group blk_trace_attr_group; # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) # define blk_add_trace_msg(q, fmt, ...) do { } while (0) +# define blk_add_cgroup_trace_msg(q, cg, fmt, ...) do { } while (0) # define blk_trace_remove_sysfs(dev) do { } while (0) # define blk_trace_note_message_enabled(q) (false) static inline int blk_trace_init_sysfs(struct device *dev) -- cgit v1.2.3 From d62e26b3ffd28f16ddae85a1babd0303a1a6dfb6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 30 Jun 2017 21:55:08 -0600 Subject: block: pass in queue to inflight accounting No functional change in this patch, just in preparation for basing the inflight mechanism on the queue in question. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/bio.h | 9 +++++---- include/linux/genhd.h | 14 +++++++++----- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 7b1cf4ba0902..9276788a9b24 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -463,10 +463,11 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -void generic_start_io_acct(int rw, unsigned long sectors, - struct hd_struct *part); -void generic_end_io_acct(int rw, struct hd_struct *part, - unsigned long start_time); +void generic_start_io_acct(struct request_queue *q, int rw, + unsigned long sectors, struct hd_struct *part); +void generic_end_io_acct(struct request_queue *q, int rw, + struct hd_struct *part, + unsigned long start_time); #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" diff --git a/include/linux/genhd.h b/include/linux/genhd.h index e619fae2f037..7f7427e00f9c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -362,23 +362,27 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_sub(cpu, gendiskp, field, subnd) \ part_stat_add(cpu, gendiskp, field, -subnd) -static inline void part_inc_in_flight(struct hd_struct *part, int rw) +static inline void part_inc_in_flight(struct request_queue *q, + struct hd_struct *part, int rw) { atomic_inc(&part->in_flight[rw]); if (part->partno) atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); } -static inline void part_dec_in_flight(struct hd_struct *part, int rw) +static inline void part_dec_in_flight(struct request_queue *q, + struct hd_struct *part, int rw) { atomic_dec(&part->in_flight[rw]); if (part->partno) atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); } -static inline int part_in_flight(struct hd_struct *part) +static inline int part_in_flight(struct request_queue *q, + struct hd_struct *part) { - return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]); + return atomic_read(&part->in_flight[0]) + + atomic_read(&part->in_flight[1]); } static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) @@ -395,7 +399,7 @@ static inline void free_part_info(struct hd_struct *part) } /* block/blk-core.c */ -extern void part_round_stats(int cpu, struct hd_struct *part); +extern void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part); /* block/genhd.c */ extern void device_add_disk(struct device *parent, struct gendisk *disk); -- cgit v1.2.3 From 0609e0efc5e15195ecf8c6d2f2e890d98760e337 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Aug 2017 17:49:47 -0600 Subject: block: make part_in_flight() take an array of two ints Instead of returning the count that matches the partition, pass in an array of two ints. Index 0 will be filled with the inflight count for the partition in question, and index 1 will filled with the root inflight count, if the partition passed in is not the root. This is in preparation for being able to calculate both in one go. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/genhd.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7f7427e00f9c..f2a3a26cdda1 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -378,11 +378,18 @@ static inline void part_dec_in_flight(struct request_queue *q, atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); } -static inline int part_in_flight(struct request_queue *q, - struct hd_struct *part) +static inline void part_in_flight(struct request_queue *q, + struct hd_struct *part, + unsigned int inflight[2]) { - return atomic_read(&part->in_flight[0]) + + inflight[0] = atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]); + if (part->partno) { + part = &part_to_disk(part)->part0; + inflight[1] = atomic_read(&part->in_flight[0]) + + atomic_read(&part->in_flight[1]); + } else + inflight[1] = 0; } static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) -- cgit v1.2.3 From f299b7c7a9dee64425e5965bd4f56dc024c1befc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 8 Aug 2017 17:51:45 -0600 Subject: blk-mq: provide internal in-flight variant We don't have to inc/dec some counter, since we can just iterate the tags. That makes inc/dec a noop, but means we have to iterate busy tags to get an in-flight count. Reviewed-by: Bart Van Assche Reviewed-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/genhd.h | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index f2a3a26cdda1..ea652bfcd675 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -362,35 +362,12 @@ static inline void free_part_stats(struct hd_struct *part) #define part_stat_sub(cpu, gendiskp, field, subnd) \ part_stat_add(cpu, gendiskp, field, -subnd) -static inline void part_inc_in_flight(struct request_queue *q, - struct hd_struct *part, int rw) -{ - atomic_inc(&part->in_flight[rw]); - if (part->partno) - atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); -} - -static inline void part_dec_in_flight(struct request_queue *q, - struct hd_struct *part, int rw) -{ - atomic_dec(&part->in_flight[rw]); - if (part->partno) - atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); -} - -static inline void part_in_flight(struct request_queue *q, - struct hd_struct *part, - unsigned int inflight[2]) -{ - inflight[0] = atomic_read(&part->in_flight[0]) + - atomic_read(&part->in_flight[1]); - if (part->partno) { - part = &part_to_disk(part)->part0; - inflight[1] = atomic_read(&part->in_flight[0]) + - atomic_read(&part->in_flight[1]); - } else - inflight[1] = 0; -} +void part_in_flight(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]); +void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); +void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, + int rw); static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk) { -- cgit v1.2.3 From e743eb1ecd5564b5ae0a4a76c1566f748a358839 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Aug 2017 08:25:38 -0600 Subject: block: remove unused syncfull/asyncfull queue flags We haven't used these in years, but somehow the definitions still remained. Kill them, and renumber the QUEUE_FLAG_ space. We had a hole in the beginning of the space, too. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 60 ++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 25f6a0cb27d3..f45f157b2910 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -601,38 +601,36 @@ struct request_queue { u64 write_hints[BLK_MAX_WRITE_HINTS]; }; -#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ -#define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ -#define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ -#define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */ -#define QUEUE_FLAG_DYING 5 /* queue being torn down */ -#define QUEUE_FLAG_BYPASS 6 /* act as dumb FIFO queue */ -#define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ -#define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ -#define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ -#define QUEUE_FLAG_FAIL_IO 10 /* fake timeout */ -#define QUEUE_FLAG_STACKABLE 11 /* supports request stacking */ -#define QUEUE_FLAG_NONROT 12 /* non-rotational device (SSD) */ +#define QUEUE_FLAG_QUEUED 0 /* uses generic tag queueing */ +#define QUEUE_FLAG_STOPPED 1 /* queue is stopped */ +#define QUEUE_FLAG_DYING 2 /* queue being torn down */ +#define QUEUE_FLAG_BYPASS 3 /* act as dumb FIFO queue */ +#define QUEUE_FLAG_BIDI 4 /* queue supports bidi requests */ +#define QUEUE_FLAG_NOMERGES 5 /* disable merge attempts */ +#define QUEUE_FLAG_SAME_COMP 6 /* complete on same CPU-group */ +#define QUEUE_FLAG_FAIL_IO 7 /* fake timeout */ +#define QUEUE_FLAG_STACKABLE 8 /* supports request stacking */ +#define QUEUE_FLAG_NONROT 9 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ -#define QUEUE_FLAG_IO_STAT 13 /* do IO stats */ -#define QUEUE_FLAG_DISCARD 14 /* supports DISCARD */ -#define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */ -#define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */ -#define QUEUE_FLAG_SECERASE 17 /* supports secure erase */ -#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ -#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ -#define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ -#define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/ -#define QUEUE_FLAG_POLL 22 /* IO polling enabled if set */ -#define QUEUE_FLAG_WC 23 /* Write back caching */ -#define QUEUE_FLAG_FUA 24 /* device supports FUA writes */ -#define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ -#define QUEUE_FLAG_DAX 26 /* device supports DAX */ -#define QUEUE_FLAG_STATS 27 /* track rq completion times */ -#define QUEUE_FLAG_POLL_STATS 28 /* collecting stats for hybrid polling */ -#define QUEUE_FLAG_REGISTERED 29 /* queue has been registered to a disk */ -#define QUEUE_FLAG_SCSI_PASSTHROUGH 30 /* queue supports SCSI commands */ -#define QUEUE_FLAG_QUIESCED 31 /* queue has been quiesced */ +#define QUEUE_FLAG_IO_STAT 10 /* do IO stats */ +#define QUEUE_FLAG_DISCARD 11 /* supports DISCARD */ +#define QUEUE_FLAG_NOXMERGES 12 /* No extended merges */ +#define QUEUE_FLAG_ADD_RANDOM 13 /* Contributes to random pool */ +#define QUEUE_FLAG_SECERASE 14 /* supports secure erase */ +#define QUEUE_FLAG_SAME_FORCE 15 /* force complete on same CPU */ +#define QUEUE_FLAG_DEAD 16 /* queue tear-down finished */ +#define QUEUE_FLAG_INIT_DONE 17 /* queue is initialized */ +#define QUEUE_FLAG_NO_SG_MERGE 18 /* don't attempt to merge SG segments*/ +#define QUEUE_FLAG_POLL 19 /* IO polling enabled if set */ +#define QUEUE_FLAG_WC 20 /* Write back caching */ +#define QUEUE_FLAG_FUA 21 /* device supports FUA writes */ +#define QUEUE_FLAG_FLUSH_NQ 22 /* flush not queueuable */ +#define QUEUE_FLAG_DAX 23 /* device supports DAX */ +#define QUEUE_FLAG_STATS 24 /* track rq completion times */ +#define QUEUE_FLAG_POLL_STATS 25 /* collecting stats for hybrid polling */ +#define QUEUE_FLAG_REGISTERED 26 /* queue has been registered to a disk */ +#define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */ +#define QUEUE_FLAG_QUIESCED 28 /* queue has been quiesced */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From d352ae205d8b05f3f7558d10f474d8436581b3e2 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 16:23:03 -0700 Subject: blk-mq: Make blk_mq_reinit_tagset() calls easier to read Since blk_mq_ops.reinit_request is only called from inside blk_mq_reinit_tagset(), make this function pointer an argument of blk_mq_reinit_tagset() instead of a member of struct blk_mq_ops. This patch does not change any functionality but makes blk_mq_reinit_tagset() calls easier to read and to analyze. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: James Smart Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 14542308d25b..50c6485cb04f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -97,7 +97,6 @@ typedef int (init_request_fn)(struct blk_mq_tag_set *set, struct request *, unsigned int, unsigned int); typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *, unsigned int); -typedef int (reinit_request_fn)(void *, struct request *); typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *, bool); @@ -143,7 +142,6 @@ struct blk_mq_ops { */ init_request_fn *init_request; exit_request_fn *exit_request; - reinit_request_fn *reinit_request; /* Called from inside blk_get_request() */ void (*initialize_rq_fn)(struct request *rq); @@ -261,7 +259,8 @@ void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -int blk_mq_reinit_tagset(struct blk_mq_tag_set *set); +int blk_mq_reinit_tagset(struct blk_mq_tag_set *set, + int (reinit_request)(void *, struct request *)); int blk_mq_map_queues(struct blk_mq_tag_set *set); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); -- cgit v1.2.3 From c2ee070fb00365d7841f6661dcdc7fbe6620bdf8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Aug 2017 19:10:31 +0200 Subject: block: cache the partition index in struct block_device Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e1fd5d21248..706dd3a972d2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -427,6 +427,7 @@ struct block_device { #endif struct block_device * bd_contains; unsigned bd_block_size; + u8 bd_partno; struct hd_struct * bd_part; /* number of times partitions within this device have been opened. */ unsigned bd_part_count; -- cgit v1.2.3 From 74d46992e0d9dee7f1f376de0d56d31614c8a17a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Aug 2017 19:10:32 +0200 Subject: block: replace bi_bdev with a gendisk pointer and partitions index This way we don't need a block_device structure to submit I/O. The block_device has different life time rules from the gendisk and request_queue and is usually only available when the block device node is open. Other callers need to explicitly create one (e.g. the lightnvm passthrough code, or the new nvme multipathing code). For the actual I/O path all that we need is the gendisk, which exists once per block device. But given that the block layer also does partition remapping we additionally need a partition index, which is used for said remapping in generic_make_request. Note that all the block drivers generally want request_queue or sometimes the gendisk, so this removes a layer of indirection all over the stack. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 18 ++++++++++++++++++ include/linux/blk_types.h | 3 ++- 2 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 9276788a9b24..a8fe7935332f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -494,6 +494,24 @@ extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); +#define bio_set_dev(bio, bdev) \ +do { \ + (bio)->bi_disk = (bdev)->bd_disk; \ + (bio)->bi_partno = (bdev)->bd_partno; \ +} while (0) + +#define bio_copy_dev(dst, src) \ +do { \ + (dst)->bi_disk = (src)->bi_disk; \ + (dst)->bi_partno = (src)->bi_partno; \ +} while (0) + +#define bio_dev(bio) \ + disk_devt((bio)->bi_disk) + +#define bio_devname(bio, buf) \ + __bdevname(bio_dev(bio), (buf)) + #ifdef CONFIG_BLK_CGROUP int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_current(struct bio *bio); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d2eb87c84d82..a2d2aa709cef 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -48,7 +48,8 @@ struct blk_issue_stat { */ struct bio { struct bio *bi_next; /* request queue link */ - struct block_device *bi_bdev; + struct gendisk *bi_disk; + u8 bi_partno; blk_status_t bi_status; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use -- cgit v1.2.3 From 9de7e14a1a9c6bc4f9be6ccd9b951341a80dbd52 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 29 Aug 2017 10:20:38 +0200 Subject: drbd: new disk-option disable-write-same Some backend devices claim to support write-same, but would fail actual write-same requests. Allow to set (or toggle) whether or not DRBD tries to support write-same. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd_genl.h | 3 ++- include/linux/drbd_limits.h | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 2896f93808ae..4e6d4d4c7056 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -132,7 +132,8 @@ GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) - __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED) + __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED_DEF) + __flg_field_def(26, 0 /* OPTIONAL */, disable_write_same, DRBD_DISABLE_WRITE_SAME_DEF) ) GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index ddac68422a96..24ae1b9b76c7 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -209,12 +209,18 @@ #define DRBD_MD_FLUSHES_DEF 1 #define DRBD_TCP_CORK_DEF 1 #define DRBD_AL_UPDATES_DEF 1 + /* We used to ignore the discard_zeroes_data setting. * To not change established (and expected) behaviour, * by default assume that, for discard_zeroes_data=0, * we can make that an effective discard_zeroes_data=1, * if we only explicitly zero-out unaligned partial chunks. */ -#define DRBD_DISCARD_ZEROES_IF_ALIGNED 1 +#define DRBD_DISCARD_ZEROES_IF_ALIGNED_DEF 1 + +/* Some backends pretend to support WRITE SAME, + * but fail such requests when they are actually submitted. + * This is to tell DRBD to not even try. */ +#define DRBD_DISABLE_WRITE_SAME_DEF 0 #define DRBD_ALLOW_TWO_PRIMARIES_DEF 0 #define DRBD_ALWAYS_ASBP_DEF 0 -- cgit v1.2.3 From 365cf663b64791e341f425385c7ae152327c7009 Mon Sep 17 00:00:00 2001 From: Roland Kammerer Date: Tue, 29 Aug 2017 10:20:48 +0200 Subject: drbd: switch from kmalloc() to kmalloc_array() We had one call to kmalloc that actually allocates an array. Switch that one to the kmalloc_array() function. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 002611c85318..2d0259327721 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -51,7 +51,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.7" +#define REL_VERSION "8.4.10" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 -- cgit v1.2.3