From cbe5e6109538ddab57764a88d9f0c2accd0c7d48 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 22 Mar 2013 22:17:36 -0600 Subject: lru_cache: introduce lc_get_cumulative() New helper to be able to consolidate more updates into a single transaction. Without this, we can only grab a single refcount on an updated element while preparing a transaction. lc_get_cumulative - like lc_get; also finds to-be-changed elements @lc: the lru cache to operate on @enr: the label to look up Unlike lc_get this also returns the element for @enr, if it is belonging to a pending transaction, so the return values are like for lc_get(), plus: pointer to an element already on the "to_be_changed" list. In this case, the cache was already marked %LC_DIRTY. Caller needs to make sure that the pending transaction is completed, before proceeding to actually use this element. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Fixed up by Jens to export lc_get_cumulative(). Signed-off-by: Jens Axboe --- include/linux/lru_cache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 4019013c6593..46262284de47 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -256,6 +256,7 @@ extern void lc_destroy(struct lru_cache *lc); extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); extern void lc_del(struct lru_cache *lc, struct lc_element *element); +extern struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); -- cgit v1.2.3 From 5bbcf5e6abe97485748b51ea0713cc3012b4a8f0 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:59 +0100 Subject: drbd: adjust upper limit for activity log extents Now that the on-disk activity-log ring buffer size is adjustable, the maximum active set can become larger, and is now limited by the use of 16bit "labels". This increases the maximum working set from 6433 to 65534 extents, each of which covers an area of 4MiB. Which means that if you use the maximum, you'd have to resync more than 250 GiB after an unclean Primary shutdown. With capable backend storage and replication links, this is entirely feasible. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd_limits.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 1fa19c5f5e64..1fedf2b17cc8 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -126,13 +126,12 @@ #define DRBD_RESYNC_RATE_DEF 250 #define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */ - /* less than 7 would hit performance unnecessarily. - * 919 slots context information per transaction, - * 32k activity log, 4k transaction size, - * one transaction in flight: - * 919 * 7 = 6433 */ + /* less than 7 would hit performance unnecessarily. */ #define DRBD_AL_EXTENTS_MIN 7 -#define DRBD_AL_EXTENTS_MAX 6433 + /* we use u16 as "slot number", (u16)~0 is "FREE". + * If you use >= 292 kB on-disk ring buffer, + * this is the maximum you can use: */ +#define DRBD_AL_EXTENTS_MAX 0xfffe #define DRBD_AL_EXTENTS_DEF 1237 #define DRBD_AL_EXTENTS_SCALE '1' -- cgit v1.2.3 From 84759c6d18c5144432781ddca037d929ee9db8a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Sep 2011 21:43:05 -0700 Subject: Revert "rw_semaphore: remove up/down_read_non_owner" This reverts commit 11b80f459adaf91a712f95e7734a17655a36bf30. Bcache needs rw semaphores for cache coherency in writeback mode - writes have to take a read lock on a per cache device rw sem, and release it when the bio completes. But since this is for bios it's naturally not in the context of the process that originally took the lock. Signed-off-by: Kent Overstreet CC: Christoph Hellwig CC: David Howells --- include/linux/rwsem.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8da67d625e13..0616ffe45702 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -133,10 +133,20 @@ do { \ _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ } while (0); +/* + * Take/release a lock when not the owner will release it. + * + * [ This API should be avoided as much as possible - the + * proper abstraction for this case is completions. ] + */ +extern void down_read_non_owner(struct rw_semaphore *sem); +extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) +# define down_read_non_owner(sem) down_read(sem) +# define up_read_non_owner(sem) up_read(sem) #endif #endif /* _LINUX_RWSEM_H */ -- cgit v1.2.3 From cafe563591446cf80bfbc2fe3bc72a2e36cf1060 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Mar 2013 16:11:31 -0700 Subject: bcache: A block layer cache Does writethrough and writeback caching, handles unclean shutdown, and has a bunch of other nifty features motivated by real world usage. See the wiki at http://bcache.evilpiepirate.org for more. Signed-off-by: Kent Overstreet --- include/linux/cgroup_subsys.h | 6 + include/linux/sched.h | 4 + include/trace/events/bcache.h | 271 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 281 insertions(+) create mode 100644 include/trace/events/bcache.h (limited to 'include') diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index f204a7a9cf38..6e7ec64b69ab 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -78,3 +78,9 @@ SUBSYS(hugetlb) #endif /* */ + +#ifdef CONFIG_CGROUP_BCACHE +SUBSYS(bcache) +#endif + +/* */ diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..a8482d063bc3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1576,6 +1576,10 @@ struct task_struct { #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif +#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) + unsigned int sequential_io; + unsigned int sequential_io_avg; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h new file mode 100644 index 000000000000..3cc5a0b278c3 --- /dev/null +++ b/include/trace/events/bcache.h @@ -0,0 +1,271 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcache + +#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BCACHE_H + +#include + +struct search; + +DECLARE_EVENT_CLASS(bcache_request, + + TP_PROTO(struct search *s, struct bio *bio), + + TP_ARGS(s, bio), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(unsigned int, orig_major ) + __field(unsigned int, orig_minor ) + __field(sector_t, sector ) + __field(dev_t, orig_sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __array(char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->orig_major = s->d->disk->major; + __entry->orig_minor = s->d->disk->first_minor; + __entry->sector = bio->bi_sector; + __entry->orig_sector = bio->bi_sector - 16; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm, + __entry->orig_major, __entry->orig_minor, + (unsigned long long)__entry->orig_sector) +); + +DEFINE_EVENT(bcache_request, bcache_request_start, + + TP_PROTO(struct search *s, struct bio *bio), + + TP_ARGS(s, bio) +); + +DEFINE_EVENT(bcache_request, bcache_request_end, + + TP_PROTO(struct search *s, struct bio *bio), + + TP_ARGS(s, bio) +); + +DECLARE_EVENT_CLASS(bcache_bio, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(sector_t, sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __array(char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) +); + + +DEFINE_EVENT(bcache_bio, bcache_passthrough, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_cache_hit, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_cache_miss, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_read_retry, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_writethrough, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_writeback, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_write_skip, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_btree_read, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_btree_write, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_write_dirty, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_read_dirty, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_write_moving, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_read_moving, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_journal_write, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DECLARE_EVENT_CLASS(bcache_cache_bio, + + TP_PROTO(struct bio *bio, + sector_t orig_sector, + struct block_device* orig_bdev), + + TP_ARGS(bio, orig_sector, orig_bdev), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(dev_t, orig_dev ) + __field(sector_t, sector ) + __field(sector_t, orig_sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __array(char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->orig_dev = orig_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->orig_sector = orig_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d %llu)", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm, + MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev), + (unsigned long long)__entry->orig_sector) +); + +DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert, + + TP_PROTO(struct bio *bio, + sector_t orig_sector, + struct block_device *orig_bdev), + + TP_ARGS(bio, orig_sector, orig_bdev) +); + +DECLARE_EVENT_CLASS(bcache_gc, + + TP_PROTO(uint8_t *uuid), + + TP_ARGS(uuid), + + TP_STRUCT__entry( + __field(uint8_t *, uuid) + ), + + TP_fast_assign( + __entry->uuid = uuid; + ), + + TP_printk("%pU", __entry->uuid) +); + + +DEFINE_EVENT(bcache_gc, bcache_gc_start, + + TP_PROTO(uint8_t *uuid), + + TP_ARGS(uuid) +); + +DEFINE_EVENT(bcache_gc, bcache_gc_end, + + TP_PROTO(uint8_t *uuid), + + TP_ARGS(uuid) +); + +#endif /* _TRACE_BCACHE_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From b949be5857a4033e00fed67b707774f52619ce60 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Wed, 27 Mar 2013 14:08:33 +0100 Subject: idr: document exit conditions on idr_for_each_entry better And some manual common subexpression elimination which may help the compiler produce smaller code. Signed-off-by: George Spelvin Signed-off-by: Philipp Reisner Signed-off-by: Jens Axboe --- include/linux/idr.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/idr.h b/include/linux/idr.h index 2640c7e99e51..6ece0583362a 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -122,11 +122,13 @@ static inline void *idr_find(struct idr *idr, int id) * @idp: idr handle * @entry: the type * to use as cursor * @id: id entry's key + * + * @entry and @id do not need to be initialized before the loop, and + * after normal terminatinon @entry is left with the value NULL. This + * is convenient for a "not found" value. */ -#define idr_for_each_entry(idp, entry, id) \ - for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \ - entry != NULL; \ - ++id, entry = (typeof(entry))idr_get_next((idp), &(id))) +#define idr_for_each_entry(idp, entry, id) \ + for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id) /* * Don't use the following functions. These exist only to suppress -- cgit v1.2.3 From 2bd5ed5d6713594eb2b4d234d01217d506279c7d Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:40 +0100 Subject: drbd: Fix disconnect to keep the peer disk state if connection breaks during operation The issue was that if the connection broke while we did the gracefull state change to C_DISCONNECTING (C_TEARDOWN), then we returned a success code from the state engine. (SS_CW_NO_NEED) The result of that is that we missed to call the fence-peer script in such a case. Fixed that by introducing a new error code (SS_OUTDATE_WO_CONN). This one should never reach back into user space. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 0c5a18ec322c..316330705fd7 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -319,7 +319,8 @@ enum drbd_state_rv { SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ SS_O_VOL_PEER_PRI = -20, - SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */ + SS_OUTDATE_WO_CONN = -21, + SS_AFTER_LAST_ERROR = -22, /* Keep this at bottom */ }; /* from drbd_strings.c */ -- cgit v1.2.3 From 3990e04df085e0561ab34f84731dc5929585c526 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:48 +0100 Subject: drbd: use sched_setscheduler() It was unnoticed for some time that assigning to current->policy is no longer sufficient to set a real time priority for a kernel thread. Reported-by: Charlie Suffin Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- include/linux/drbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 316330705fd7..1b4d4ee1168f 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -52,7 +52,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.2" +#define REL_VERSION "8.4.3" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 -- cgit v1.2.3