summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/device-mapper/statistics.txt19
-rw-r--r--drivers/md/dm-stats.c205
2 files changed, 199 insertions, 25 deletions
diff --git a/Documentation/device-mapper/statistics.txt b/Documentation/device-mapper/statistics.txt
index ff6baeaa71f7..4919b2dfd1b3 100644
--- a/Documentation/device-mapper/statistics.txt
+++ b/Documentation/device-mapper/statistics.txt
@@ -13,9 +13,10 @@ the range specified.
The I/O statistics counters for each step-sized area of a region are
in the same format as /sys/block/*/stat or /proc/diskstats (see:
Documentation/iostats.txt). But two extra counters (12 and 13) are
-provided: total time spent reading and writing. All these counters may
-be accessed by sending the @stats_print message to the appropriate DM
-device via dmsetup.
+provided: total time spent reading and writing. When the histogram
+argument is used, the 14th parameter is reported that represents the
+histogram of latencies. All these counters may be accessed by sending
+the @stats_print message to the appropriate DM device via dmsetup.
The reported times are in milliseconds and the granularity depends on
the kernel ticks. When the option precise_timestamps is used, the
@@ -64,6 +65,18 @@ Messages
used, the resulting times are in nanoseconds instead of
milliseconds. Precise timestamps are a little bit slower
to obtain than jiffies-based timestamps.
+ histogram:n1,n2,n3,n4,... - collect histogram of latencies. The
+ numbers n1, n2, etc are times that represent the boundaries
+ of the histogram. If precise_timestamps is not used, the
+ times are in milliseconds, otherwise they are in
+ nanoseconds. For each range, the kernel will report the
+ number of requests that completed within this range. For
+ example, if we use "histogram:10,20,30", the kernel will
+ report four numbers a:b:c:d. a is the number of requests
+ that took 0-10 ms to complete, b is the number of requests
+ that took 10-20 ms to complete, c is the number of requests
+ that took 20-30 ms to complete and d is the number of
+ requests that took more than 30 ms to complete.
<program_id>
An optional parameter. A name that uniquely identifies
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 4bfd84ab1d4a..faf1071ef631 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -29,6 +29,7 @@ struct dm_stat_percpu {
unsigned long long io_ticks[2];
unsigned long long io_ticks_total;
unsigned long long time_in_queue;
+ unsigned long long *histogram;
};
struct dm_stat_shared {
@@ -45,11 +46,14 @@ struct dm_stat {
sector_t start;
sector_t end;
sector_t step;
+ unsigned n_histogram_entries;
+ unsigned long long *histogram_boundaries;
const char *program_id;
const char *aux_data;
struct rcu_head rcu_head;
size_t shared_alloc_size;
size_t percpu_alloc_size;
+ size_t histogram_alloc_size;
struct dm_stat_percpu *stat_percpu[NR_CPUS];
struct dm_stat_shared stat_shared[0];
};
@@ -173,8 +177,11 @@ static void dm_stat_free(struct rcu_head *head)
kfree(s->program_id);
kfree(s->aux_data);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
+ dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
+ }
+ dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
dm_kvfree(s, s->shared_alloc_size);
}
@@ -228,6 +235,8 @@ void dm_stats_cleanup(struct dm_stats *stats)
static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
sector_t step, unsigned stat_flags,
+ unsigned n_histogram_entries,
+ unsigned long long *histogram_boundaries,
const char *program_id, const char *aux_data,
void (*suspend_callback)(struct mapped_device *),
void (*resume_callback)(struct mapped_device *),
@@ -239,6 +248,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
size_t ni;
size_t shared_alloc_size;
size_t percpu_alloc_size;
+ size_t histogram_alloc_size;
struct dm_stat_percpu *p;
int cpu;
int ret_id;
@@ -262,7 +272,12 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
return -EOVERFLOW;
- if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
+ histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
+ if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
+ return -EOVERFLOW;
+
+ if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
+ num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
return -ENOMEM;
s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
@@ -276,6 +291,15 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
s->step = step;
s->shared_alloc_size = shared_alloc_size;
s->percpu_alloc_size = percpu_alloc_size;
+ s->histogram_alloc_size = histogram_alloc_size;
+
+ s->n_histogram_entries = n_histogram_entries;
+ s->histogram_boundaries = kmemdup(histogram_boundaries,
+ s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+ if (!s->histogram_boundaries) {
+ r = -ENOMEM;
+ goto out;
+ }
s->program_id = kstrdup(program_id, GFP_KERNEL);
if (!s->program_id) {
@@ -293,6 +317,19 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
}
+ if (s->n_histogram_entries) {
+ unsigned long long *hi;
+ hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
+ if (!hi) {
+ r = -ENOMEM;
+ goto out;
+ }
+ for (ni = 0; ni < n_entries; ni++) {
+ s->stat_shared[ni].tmp.histogram = hi;
+ hi += s->n_histogram_entries + 1;
+ }
+ }
+
for_each_possible_cpu(cpu) {
p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
if (!p) {
@@ -300,6 +337,18 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
goto out;
}
s->stat_percpu[cpu] = p;
+ if (s->n_histogram_entries) {
+ unsigned long long *hi;
+ hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
+ if (!hi) {
+ r = -ENOMEM;
+ goto out;
+ }
+ for (ni = 0; ni < n_entries; ni++) {
+ p[ni].histogram = hi;
+ hi += s->n_histogram_entries + 1;
+ }
+ }
}
/*
@@ -377,9 +426,11 @@ static int dm_stats_delete(struct dm_stats *stats, int id)
* vfree can't be called from RCU callback
*/
for_each_possible_cpu(cpu)
- if (is_vmalloc_addr(s->stat_percpu))
+ if (is_vmalloc_addr(s->stat_percpu) ||
+ is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
goto do_sync_free;
- if (is_vmalloc_addr(s)) {
+ if (is_vmalloc_addr(s) ||
+ is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
do_sync_free:
synchronize_rcu_expedited();
dm_stat_free(&s->rcu_head);
@@ -486,15 +537,32 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
dm_stat_round(s, shared, p);
atomic_inc(&shared->in_flight[idx]);
} else {
+ unsigned long long duration;
dm_stat_round(s, shared, p);
atomic_dec(&shared->in_flight[idx]);
p->sectors[idx] += len;
p->ios[idx] += 1;
p->merges[idx] += stats_aux->merged;
- if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))
+ if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
p->ticks[idx] += duration_jiffies;
- else
+ duration = jiffies_to_msecs(duration_jiffies);
+ } else {
p->ticks[idx] += stats_aux->duration_ns;
+ duration = stats_aux->duration_ns;
+ }
+ if (s->n_histogram_entries) {
+ unsigned lo = 0, hi = s->n_histogram_entries + 1;
+ while (lo + 1 < hi) {
+ unsigned mid = (lo + hi) / 2;
+ if (s->histogram_boundaries[mid - 1] > duration) {
+ hi = mid;
+ } else {
+ lo = mid;
+ }
+
+ }
+ p->histogram[lo]++;
+ }
}
#if BITS_PER_LONG == 32
@@ -600,7 +668,22 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
dm_stat_round(s, shared, p);
local_irq_enable();
- memset(&shared->tmp, 0, sizeof(shared->tmp));
+ shared->tmp.sectors[READ] = 0;
+ shared->tmp.sectors[WRITE] = 0;
+ shared->tmp.ios[READ] = 0;
+ shared->tmp.ios[WRITE] = 0;
+ shared->tmp.merges[READ] = 0;
+ shared->tmp.merges[WRITE] = 0;
+ shared->tmp.ticks[READ] = 0;
+ shared->tmp.ticks[WRITE] = 0;
+ shared->tmp.io_ticks[READ] = 0;
+ shared->tmp.io_ticks[WRITE] = 0;
+ shared->tmp.io_ticks_total = 0;
+ shared->tmp.time_in_queue = 0;
+
+ if (s->n_histogram_entries)
+ memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
+
for_each_possible_cpu(cpu) {
p = &s->stat_percpu[cpu][x];
shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
@@ -615,6 +698,11 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
+ if (s->n_histogram_entries) {
+ unsigned i;
+ for (i = 0; i < s->n_histogram_entries + 1; i++)
+ shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
+ }
}
}
@@ -644,6 +732,15 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
p->io_ticks_total -= shared->tmp.io_ticks_total;
p->time_in_queue -= shared->tmp.time_in_queue;
local_irq_enable();
+ if (s->n_histogram_entries) {
+ unsigned i;
+ for (i = 0; i < s->n_histogram_entries + 1; i++) {
+ local_irq_disable();
+ p = &s->stat_percpu[smp_processor_id()][x];
+ p->histogram[i] -= shared->tmp.histogram[i];
+ local_irq_enable();
+ }
+ }
}
}
@@ -733,7 +830,7 @@ static int dm_stats_print(struct dm_stats *stats, int id,
__dm_stat_init_temporary_percpu_totals(shared, s, x);
- DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
+ DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
(unsigned long long)start,
(unsigned long long)step,
shared->tmp.ios[READ],
@@ -749,6 +846,13 @@ static int dm_stats_print(struct dm_stats *stats, int id,
dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
+ if (s->n_histogram_entries) {
+ unsigned i;
+ for (i = 0; i < s->n_histogram_entries + 1; i++) {
+ DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
+ }
+ }
+ DMEMIT("\n");
if (unlikely(sz + 1 >= maxlen))
goto buffer_overflow;
@@ -790,10 +894,47 @@ static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data
return 0;
}
+static int parse_histogram(const char *h, unsigned *n_histogram_entries,
+ unsigned long long **histogram_boundaries)
+{
+ const char *q;
+ unsigned n;
+ unsigned long long last;
+
+ *n_histogram_entries = 1;
+ for (q = h; *q; q++)
+ if (*q == ',')
+ (*n_histogram_entries)++;
+
+ *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+ if (!*histogram_boundaries)
+ return -ENOMEM;
+
+ n = 0;
+ last = 0;
+ while (1) {
+ unsigned long long hi;
+ int s;
+ char ch;
+ s = sscanf(h, "%llu%c", &hi, &ch);
+ if (!s || (s == 2 && ch != ','))
+ return -EINVAL;
+ if (hi <= last)
+ return -EINVAL;
+ last = hi;
+ (*histogram_boundaries)[n] = hi;
+ if (s == 1)
+ return 0;
+ h = strchr(h, ',') + 1;
+ n++;
+ }
+}
+
static int message_stats_create(struct mapped_device *md,
unsigned argc, char **argv,
char *result, unsigned maxlen)
{
+ int r;
int id;
char dummy;
unsigned long long start, end, len, step;
@@ -801,6 +942,9 @@ static int message_stats_create(struct mapped_device *md,
const char *program_id, *aux_data;
unsigned stat_flags = 0;
+ unsigned n_histogram_entries = 0;
+ unsigned long long *histogram_boundaries = NULL;
+
struct dm_arg_set as, as_backup;
const char *a;
unsigned feature_args;
@@ -811,7 +955,7 @@ static int message_stats_create(struct mapped_device *md,
*/
if (argc < 3)
- return -EINVAL;
+ goto ret_einval;
as.argc = argc;
as.argv = argv;
@@ -825,11 +969,11 @@ static int message_stats_create(struct mapped_device *md,
len = 1;
} else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
start != (sector_t)start || len != (sector_t)len)
- return -EINVAL;
+ goto ret_einval;
end = start + len;
if (start >= end)
- return -EINVAL;
+ goto ret_einval;
a = dm_shift_arg(&as);
if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
@@ -842,7 +986,7 @@ static int message_stats_create(struct mapped_device *md,
step = 1;
} else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
step != (sector_t)step || !step)
- return -EINVAL;
+ goto ret_einval;
as_backup = as;
a = dm_shift_arg(&as);
@@ -850,11 +994,16 @@ static int message_stats_create(struct mapped_device *md,
while (feature_args--) {
a = dm_shift_arg(&as);
if (!a)
- return -EINVAL;
+ goto ret_einval;
if (!strcasecmp(a, "precise_timestamps"))
stat_flags |= STAT_PRECISE_TIMESTAMPS;
- else
- return -EINVAL;
+ else if (!strncasecmp(a, "histogram:", 10)) {
+ if (n_histogram_entries)
+ goto ret_einval;
+ if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
+ goto ret;
+ } else
+ goto ret_einval;
}
} else {
as = as_backup;
@@ -872,7 +1021,7 @@ static int message_stats_create(struct mapped_device *md,
aux_data = a;
if (as.argc)
- return -EINVAL;
+ goto ret_einval;
/*
* If a buffer overflow happens after we created the region,
@@ -881,17 +1030,29 @@ static int message_stats_create(struct mapped_device *md,
* leaked). So we must detect buffer overflow in advance.
*/
snprintf(result, maxlen, "%d", INT_MAX);
- if (dm_message_test_buffer_overflow(result, maxlen))
- return 1;
+ if (dm_message_test_buffer_overflow(result, maxlen)) {
+ r = 1;
+ goto ret;
+ }
- id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, program_id, aux_data,
+ id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
+ n_histogram_entries, histogram_boundaries, program_id, aux_data,
dm_internal_suspend_fast, dm_internal_resume_fast, md);
- if (id < 0)
- return id;
+ if (id < 0) {
+ r = id;
+ goto ret;
+ }
snprintf(result, maxlen, "%d", id);
- return 1;
+ r = 1;
+ goto ret;
+
+ret_einval:
+ r = -EINVAL;
+ret:
+ kfree(histogram_boundaries);
+ return r;
}
static int message_stats_delete(struct mapped_device *md,