From aa8188253474b4053bc2900d9fcb545ce68bdf5c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:51 -0700 Subject: kernfs: add exportfs operations Now we have the facilities to implement exportfs operations. The idea is cgroup can export the fhandle info to userspace, then userspace uses fhandle to find the cgroup name. Another example is userspace can get fhandle for a cgroup and BPF uses the fhandle to filter info for the cgroup. Acked-by: Greg Kroah-Hartman Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/cgroup/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 620794a20a33..6cefa277f39c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1737,7 +1737,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops; root->kf_root = kernfs_create_root(kf_sops, - KERNFS_ROOT_CREATE_DEACTIVATED, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_SUPPORT_EXPORTOP, root_cgrp); if (IS_ERR(root->kf_root)) { ret = PTR_ERR(root->kf_root); -- cgit v1.2.3 From ca1136c99b66b1566781ff12ecddc635d570f932 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:53 -0700 Subject: blktrace: export cgroup info in trace Currently blktrace isn't cgroup aware. blktrace prints out task name of current context, but the task of current context isn't always in the cgroup where the BIO comes from. We can't use task name to find out IO cgroup. For example, Writeback BIOs always comes from flusher thread but the BIOs are for different blk cgroups. Request could be requeued and dispatched from completely different tasks. MD/DM are another examples. This patch tries to fix the gap. We print out cgroup fhandle info in blktrace. Userspace can use open_by_handle_at() syscall to find the cgroup by fhandle. Or userspace can use name_to_handle_at() syscall to find fhandle for a cgroup and use a BPF program to filter out blktrace for a specific cgroup. We add a new 'blk_cgroup' trace option for blk tracer. It's default off. Application which doesn't know the new option isn't affected. When it's on, we output fhandle info right after blk_io_trace with an extra bit set in event action. So from application point of view, blktrace with the option will output new actions. I didn't change blk trace event yet, since I'm not sure if changing the trace event output is an ABI issue. If not, I'll do it later. Acked-by: Steven Rostedt (VMware) Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 231 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 158 insertions(+), 73 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index bc364f86100a..f393d7a43695 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "../../block/blk.h" @@ -46,10 +47,14 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 +#define TRACE_BLK_OPT_CGROUP 0x2 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, +#ifdef CONFIG_BLK_CGROUP + { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, +#endif { } }; @@ -68,7 +73,8 @@ static void blk_unregister_tracepoints(void); * Send out a notify message. */ static void trace_note(struct blk_trace *bt, pid_t pid, int action, - const void *data, size_t len) + const void *data, size_t len, + union kernfs_node_id *cgid) { struct blk_io_trace *t; struct ring_buffer_event *event = NULL; @@ -76,12 +82,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, int pc = 0; int cpu = smp_processor_id(); bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; if (blk_tracer) { buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + len, + sizeof(*t) + len + cgid_len, 0, pc); if (!event) return; @@ -92,17 +99,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, if (!bt->rchan) return; - t = relay_reserve(bt->rchan, sizeof(*t) + len); + t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len); if (t) { t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; t->time = ktime_to_ns(ktime_get()); record_it: t->device = bt->dev; - t->action = action; + t->action = action | (cgid ? __BLK_TN_CGROUP : 0); t->pid = pid; t->cpu = cpu; - t->pdu_len = len; - memcpy((void *) t + sizeof(*t), data, len); + t->pdu_len = len + cgid_len; + if (cgid) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); + memcpy((void *) t + sizeof(*t) + cgid_len, data, len); if (blk_tracer) trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -122,7 +131,7 @@ static void trace_note_tsk(struct task_struct *tsk) spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, - sizeof(tsk->comm)); + sizeof(tsk->comm), NULL); } spin_unlock_irqrestore(&running_trace_lock, flags); } @@ -139,7 +148,7 @@ static void trace_note_time(struct blk_trace *bt) words[1] = now.tv_nsec; local_irq_save(flags); - trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); + trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words), NULL); local_irq_restore(flags); } @@ -167,7 +176,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); - trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -204,7 +213,7 @@ static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), */ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, int op, int op_flags, u32 what, int error, int pdu_len, - void *pdu_data) + void *pdu_data, union kernfs_node_id *cgid) { struct task_struct *tsk = current; struct ring_buffer_event *event = NULL; @@ -215,6 +224,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, pid_t pid; int cpu, pc = 0; bool blk_tracer = blk_tracer_enabled; + ssize_t cgid_len = cgid ? sizeof(*cgid) : 0; if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) return; @@ -229,6 +239,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) what |= BLK_TC_ACT(BLK_TC_FLUSH); + if (cgid) + what |= __BLK_TA_CGROUP; pid = tsk->pid; if (act_log_check(bt, what, sector, pid)) @@ -241,7 +253,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, - sizeof(*t) + pdu_len, + sizeof(*t) + pdu_len + cgid_len, 0, pc); if (!event) return; @@ -258,7 +270,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, * from coming in and stepping on our toes. */ local_irq_save(flags); - t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); + t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len + cgid_len); if (t) { sequence = per_cpu_ptr(bt->sequence, cpu); @@ -280,10 +292,12 @@ record_it: t->action = what; t->device = bt->dev; t->error = error; - t->pdu_len = pdu_len; + t->pdu_len = pdu_len + cgid_len; + if (cgid_len) + memcpy((void *)t + sizeof(*t), cgid, cgid_len); if (pdu_len) - memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); + memcpy((void *)t + sizeof(*t) + cgid_len, pdu_data, pdu_len); if (blk_tracer) { trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc); @@ -684,6 +698,36 @@ void blk_trace_shutdown(struct request_queue *q) } } +#ifdef CONFIG_BLK_CGROUP +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + struct blk_trace *bt = q->blk_trace; + + if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + return NULL; + + if (!bio->bi_css) + return NULL; + return cgroup_get_kernfs_id(bio->bi_css->cgroup); +} +#else +static union kernfs_node_id * +blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) +{ + return NULL; +} +#endif + +static union kernfs_node_id * +blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) +{ + if (!rq->bio) + return NULL; + /* Use the first bio */ + return blk_trace_bio_get_cgid(q, rq->bio); +} + /* * blktrace probes */ @@ -694,13 +738,15 @@ void blk_trace_shutdown(struct request_queue *q) * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action + * @cgid: the cgroup info * * Description: * Records an action against a request. Will log the bio offset + size. * **/ static void blk_add_trace_rq(struct request *rq, int error, - unsigned int nr_bytes, u32 what) + unsigned int nr_bytes, u32 what, + union kernfs_node_id *cgid) { struct blk_trace *bt = rq->q->blk_trace; @@ -713,32 +759,36 @@ static void blk_add_trace_rq(struct request *rq, int error, what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, error, 0, NULL); + rq->cmd_flags, what, error, 0, NULL, cgid); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, + blk_trace_request_get_cgid(q, rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, int error, unsigned int nr_bytes) { - blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE); + blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, + blk_trace_request_get_cgid(rq->q, rq)); } /** @@ -753,7 +803,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error) + u32 what, int error, union kernfs_node_id *cgid) { struct blk_trace *bt = q->blk_trace; @@ -761,20 +811,22 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, what, error, 0, NULL); + bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid); } static void blk_add_trace_bio_bounce(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio, int error) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_backmerge(void *ignore, @@ -782,7 +834,8 @@ static void blk_add_trace_bio_backmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_frontmerge(void *ignore, @@ -790,13 +843,15 @@ static void blk_add_trace_bio_frontmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_queue(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_getrq(void *ignore, @@ -804,13 +859,14 @@ static void blk_add_trace_getrq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL); + NULL, NULL); } } @@ -820,13 +876,14 @@ static void blk_add_trace_sleeprq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0, + blk_trace_bio_get_cgid(q, bio)); else { struct blk_trace *bt = q->blk_trace; if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL); + 0, 0, NULL, NULL); } } @@ -835,7 +892,7 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) struct blk_trace *bt = q->blk_trace; if (bt) - __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); + __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, @@ -852,7 +909,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, else what = BLK_TA_UNPLUG_TIMER; - __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); } } @@ -868,7 +925,7 @@ static void blk_add_trace_split(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), - &rpdu); + &rpdu, blk_trace_bio_get_cgid(q, bio)); } } @@ -901,7 +958,7 @@ static void blk_add_trace_bio_remap(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, - sizeof(r), &r); + sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); } /** @@ -934,7 +991,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, - sizeof(r), &r); + sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); } /** @@ -958,7 +1015,8 @@ void blk_add_driver_data(struct request_queue *q, return; __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, 0, len, data); + BLK_TA_DRV_DATA, 0, len, data, + blk_trace_request_get_cgid(q, rq)); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1031,7 +1089,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) int i = 0; int tc = t->action >> BLK_TC_SHIFT; - if (t->action == BLK_TN_MESSAGE) { + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { rwbs[i++] = 'N'; goto out; } @@ -1066,9 +1124,21 @@ const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) return (const struct blk_io_trace *)ent; } -static inline const void *pdu_start(const struct trace_entry *ent) +static inline const void *pdu_start(const struct trace_entry *ent, bool has_cg) { - return te_blk_io_trace(ent) + 1; + return (void *)(te_blk_io_trace(ent) + 1) + + (has_cg ? sizeof(union kernfs_node_id) : 0); +} + +static inline const void *cgid_start(const struct trace_entry *ent) +{ + return (void *)(te_blk_io_trace(ent) + 1); +} + +static inline int pdu_real_len(const struct trace_entry *ent, bool has_cg) +{ + return te_blk_io_trace(ent)->pdu_len - + (has_cg ? sizeof(union kernfs_node_id) : 0); } static inline u32 t_action(const struct trace_entry *ent) @@ -1096,16 +1166,16 @@ static inline __u16 t_error(const struct trace_entry *ent) return te_blk_io_trace(ent)->error; } -static __u64 get_pdu_int(const struct trace_entry *ent) +static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg) { - const __u64 *val = pdu_start(ent); + const __u64 *val = pdu_start(ent, has_cg); return be64_to_cpu(*val); } static void get_pdu_remap(const struct trace_entry *ent, - struct blk_io_trace_remap *r) + struct blk_io_trace_remap *r, bool has_cg) { - const struct blk_io_trace_remap *__r = pdu_start(ent); + const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg); __u64 sector_from = __r->sector_from; r->device_from = be32_to_cpu(__r->device_from); @@ -1113,9 +1183,11 @@ static void get_pdu_remap(const struct trace_entry *ent, r->sector_from = be64_to_cpu(sector_from); } -typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act); +typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act, + bool has_cg); -static void blk_log_action_classic(struct trace_iterator *iter, const char *act) +static void blk_log_action_classic(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; unsigned long long ts = iter->ts; @@ -1131,24 +1203,33 @@ static void blk_log_action_classic(struct trace_iterator *iter, const char *act) secs, nsec_rem, iter->ent->pid, act, rwbs); } -static void blk_log_action(struct trace_iterator *iter, const char *act) +static void blk_log_action(struct trace_iterator *iter, const char *act, + bool has_cg) { char rwbs[RWBS_LEN]; const struct blk_io_trace *t = te_blk_io_trace(iter->ent); fill_rwbs(rwbs, t); - trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", - MAJOR(t->device), MINOR(t->device), act, rwbs); + if (has_cg) { + const union kernfs_node_id *id = cgid_start(iter->ent); + + trace_seq_printf(&iter->seq, "%3d,%-3d %x,%-x %2s %3s ", + MAJOR(t->device), MINOR(t->device), + id->ino, id->generation, act, rwbs); + } else + trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); } -static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_dump_pdu(struct trace_seq *s, + const struct trace_entry *ent, bool has_cg) { const unsigned char *pdu_buf; int pdu_len; int i, end; - pdu_buf = pdu_start(ent); - pdu_len = te_blk_io_trace(ent)->pdu_len; + pdu_buf = pdu_start(ent, has_cg); + pdu_len = pdu_real_len(ent, has_cg); if (!pdu_len) return; @@ -1179,7 +1260,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) trace_seq_puts(s, ") "); } -static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1187,7 +1268,7 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { trace_seq_printf(s, "%u ", t_bytes(ent)); - blk_log_dump_pdu(s, ent); + blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%s]\n", cmd); } else { if (t_sec(ent)) @@ -1199,10 +1280,10 @@ static void blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) } static void blk_log_with_error(struct trace_seq *s, - const struct trace_entry *ent) + const struct trace_entry *ent, bool has_cg) { if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { - blk_log_dump_pdu(s, ent); + blk_log_dump_pdu(s, ent, has_cg); trace_seq_printf(s, "[%d]\n", t_error(ent)); } else { if (t_sec(ent)) @@ -1215,18 +1296,18 @@ static void blk_log_with_error(struct trace_seq *s, } } -static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { struct blk_io_trace_remap r = { .device_from = 0, }; - get_pdu_remap(ent, &r); + get_pdu_remap(ent, &r, has_cg); trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", t_sector(ent), t_sec(ent), MAJOR(r.device_from), MINOR(r.device_from), (unsigned long long)r.sector_from); } -static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; @@ -1235,30 +1316,31 @@ static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) trace_seq_printf(s, "[%s]\n", cmd); } -static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); + trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent, has_cg)); } -static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_split(struct trace_seq *s, const struct trace_entry *ent, bool has_cg) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), - get_pdu_int(ent), cmd); + get_pdu_int(ent, has_cg), cmd); } -static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +static void blk_log_msg(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg) { - const struct blk_io_trace *t = te_blk_io_trace(ent); - trace_seq_putmem(s, t + 1, t->pdu_len); + trace_seq_putmem(s, pdu_start(ent, has_cg), + pdu_real_len(ent, has_cg)); trace_seq_putc(s, '\n'); } @@ -1298,7 +1380,8 @@ static void blk_tracer_reset(struct trace_array *tr) static const struct { const char *act[2]; - void (*print)(struct trace_seq *s, const struct trace_entry *ent); + void (*print)(struct trace_seq *s, const struct trace_entry *ent, + bool has_cg); } what2act[] = { [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, @@ -1326,23 +1409,25 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, u16 what; bool long_act; blk_log_action_t *log_action; + bool has_cg; t = te_blk_io_trace(iter->ent); - what = t->action & ((1 << BLK_TC_SHIFT) - 1); + what = (t->action & ((1 << BLK_TC_SHIFT) - 1)) & ~__BLK_TA_CGROUP; long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE); log_action = classic ? &blk_log_action_classic : &blk_log_action; + has_cg = t->action & __BLK_TA_CGROUP; - if (t->action == BLK_TN_MESSAGE) { - log_action(iter, long_act ? "message" : "m"); - blk_log_msg(s, iter->ent); + if ((t->action & ~__BLK_TN_CGROUP) == BLK_TN_MESSAGE) { + log_action(iter, long_act ? "message" : "m", has_cg); + blk_log_msg(s, iter->ent, has_cg); return trace_handle_return(s); } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) trace_seq_printf(s, "Unknown action %x\n", what); else { - log_action(iter, what2act[what].act[long_act]); - what2act[what].print(s, iter->ent); + log_action(iter, what2act[what].act[long_act], has_cg); + what2act[what].print(s, iter->ent, has_cg); } return trace_handle_return(s); -- cgit v1.2.3 From 69fd5c391763bd94a40dd152bc72a7f230137150 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:55 -0700 Subject: blktrace: add an option to allow displaying cgroup path By default we output cgroup id in blktrace. This adds an option to display cgroup path. Since get cgroup path is a relativly heavy operation, we don't enable it by default. with the option enabled, blktrace will output something like this: dd-1353 [007] d..2 293.015252: 8,0 /test/level D R 24 + 8 [dd] Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/cgroup/cgroup.c | 12 ++++++++++++ kernel/trace/blktrace.c | 14 +++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6cefa277f39c..2aba1c519138 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4701,6 +4701,18 @@ static int __init cgroup_wq_init(void) } core_initcall(cgroup_wq_init); +void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, + char *buf, size_t buflen) +{ + struct kernfs_node *kn; + + kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id); + if (!kn) + return; + kernfs_path(kn, buf, buflen); + kernfs_put(kn); +} + /* * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f393d7a43695..e90974ed4532 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -48,12 +48,14 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 #define TRACE_BLK_OPT_CGROUP 0x2 +#define TRACE_BLK_OPT_CGNAME 0x4 static struct tracer_opt blk_tracer_opts[] = { /* Default disable the minimalistic output */ { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, #ifdef CONFIG_BLK_CGROUP { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) }, + { TRACER_OPT(blk_cgname, TRACE_BLK_OPT_CGNAME) }, #endif { } }; @@ -1213,7 +1215,17 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, if (has_cg) { const union kernfs_node_id *id = cgid_start(iter->ent); - trace_seq_printf(&iter->seq, "%3d,%-3d %x,%-x %2s %3s ", + if (blk_tracer_flags.val & TRACE_BLK_OPT_CGNAME) { + char blkcg_name_buf[NAME_MAX + 1] = "<...>"; + + cgroup_path_from_kernfs_id(id, blkcg_name_buf, + sizeof(blkcg_name_buf)); + trace_seq_printf(&iter->seq, "%3d,%-3d %s %2s %3s ", + MAJOR(t->device), MINOR(t->device), + blkcg_name_buf, act, rwbs); + } else + trace_seq_printf(&iter->seq, + "%3d,%-3d %x,%-x %2s %3s ", MAJOR(t->device), MINOR(t->device), id->ino, id->generation, act, rwbs); } else -- cgit v1.2.3 From 35fe6d763229e8fc0eb5f9b93a401673cfcb5e1e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 12 Jul 2017 11:49:56 -0700 Subject: block: use standard blktrace API to output cgroup info for debug notes Currently cfq/bfq/blk-throttle output cgroup info in trace in their own way. Now we have standard blktrace API for this, so convert them to use it. Note, this changes the behavior a little bit. cgroup info isn't output by default, we only do this with 'blk_cgroup' option enabled. cgroup info isn't output as a string by default too, we only do this with 'blk_cgname' option enabled. Also cgroup info is output in different position of the note string. I think these behavior changes aren't a big issue (actually we make trace data shorter which is good), since the blktrace note is solely for debugging. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e90974ed4532..7724de18d2fe 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -154,7 +154,8 @@ static void trace_note_time(struct blk_trace *bt) local_irq_restore(flags); } -void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) +void __trace_note_message(struct blk_trace *bt, struct blkcg *blkcg, + const char *fmt, ...) { int n; va_list args; @@ -178,7 +179,14 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); va_end(args); + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) + blkcg = NULL; +#ifdef CONFIG_BLK_CGROUP + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, + blkcg ? cgroup_get_kernfs_id(blkcg->css.cgroup) : NULL); +#else trace_note(bt, 0, BLK_TN_MESSAGE, buf, n, NULL); +#endif local_irq_restore(flags); } EXPORT_SYMBOL_GPL(__trace_note_message); @@ -375,7 +383,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return PTR_ERR(msg); bt = filp->private_data; - __trace_note_message(bt, "%s", msg); + __trace_note_message(bt, NULL, "%s", msg); kfree(msg); return count; -- cgit v1.2.3 From 74d46992e0d9dee7f1f376de0d56d31614c8a17a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Aug 2017 19:10:32 +0200 Subject: block: replace bi_bdev with a gendisk pointer and partitions index This way we don't need a block_device structure to submit I/O. The block_device has different life time rules from the gendisk and request_queue and is usually only available when the block device node is open. Other callers need to explicitly create one (e.g. the lightnvm passthrough code, or the new nvme multipathing code). For the actual I/O path all that we need is the gendisk, which exists once per block device. But given that the block layer also does partition remapping we additionally need a partition index, which is used for said remapping in generic_make_request. Note that all the block drivers generally want request_queue or sometimes the gendisk, so this removes a layer of indirection all over the stack. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/power/swap.c | 5 ++--- kernel/trace/blktrace.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 57d22571f306..d7cdc426ee38 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -242,8 +242,7 @@ static void hib_end_io(struct bio *bio) if (bio->bi_status) { printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); } @@ -270,7 +269,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr, bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = hib_resume_bdev; + bio_set_dev(bio, hib_resume_bdev); bio_set_op_attrs(bio, op, op_flags); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7724de18d2fe..2a685b45b73b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -963,7 +963,7 @@ static void blk_add_trace_bio_remap(void *ignore, return; r.device_from = cpu_to_be32(dev); - r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.device_to = cpu_to_be32(bio_dev(bio)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, -- cgit v1.2.3