From e09b457bdb7e8d23fc54dcef0930ac697d8de895 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 13 Nov 2010 11:55:17 +0100 Subject: block: simplify holder symlink handling Code to manage symlinks in /sys/block/*/{holders|slaves} are overly complex with multiple holder considerations, redundant extra references to all involved kobjects, unused generic kobject holder support and unnecessary mixup with bd_claim/release functionalities. Strip it down to what's necessary (single gendisk holder) and make it use a separate interface. This is a step for cleaning up bd_claim/release. This patch makes dm-table slightly more complex but it will be simplified again with further changes. Signed-off-by: Tejun Heo Acked-by: Neil Brown Acked-by: Mike Snitzer Cc: dm-devel@redhat.com --- include/linux/fs.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 334d68a17108..66b7f2c5d7e9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -663,7 +663,7 @@ struct block_device { void * bd_holder; int bd_holders; #ifdef CONFIG_SYSFS - struct list_head bd_holder_list; + struct gendisk * bd_holder_disk; /* for sysfs slave linkng */ #endif struct block_device * bd_contains; unsigned bd_block_size; @@ -2042,11 +2042,17 @@ extern int blkdev_put(struct block_device *, fmode_t); extern int bd_claim(struct block_device *, void *); extern void bd_release(struct block_device *); #ifdef CONFIG_SYSFS -extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *); -extern void bd_release_from_disk(struct block_device *, struct gendisk *); +extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); +extern void bd_unlink_disk_holder(struct block_device *bdev); #else -#define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder) -#define bd_release_from_disk(bdev, disk) bd_release(bdev) +static inline int bd_link_disk_holder(struct block_device *bdev, + struct gendisk *disk) +{ + return 0; +} +static inline void bd_unlink_disk_holder(struct block_device *bdev) +{ +} #endif #endif -- cgit v1.2.3 From e525fd89d380c4a94c0d63913a1dd1a593ed25e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 13 Nov 2010 11:55:17 +0100 Subject: block: make blkdev_get/put() handle exclusive access Over time, block layer has accumulated a set of APIs dealing with bdev open, close, claim and release. * blkdev_get/put() are the primary open and close functions. * bd_claim/release() deal with exclusive open. * open/close_bdev_exclusive() are combination of open and claim and the other way around, respectively. * bd_link/unlink_disk_holder() to create and remove holder/slave symlinks. * open_by_devnum() wraps bdget() + blkdev_get(). The interface is a bit confusing and the decoupling of open and claim makes it impossible to properly guarantee exclusive access as in-kernel open + claim sequence can disturb the existing exclusive open even before the block layer knows the current open if for another exclusive access. Reorganize the interface such that, * blkdev_get() is extended to include exclusive access management. @holder argument is added and, if is @FMODE_EXCL specified, it will gain exclusive access atomically w.r.t. other exclusive accesses. * blkdev_put() is similarly extended. It now takes @mode argument and if @FMODE_EXCL is set, it releases an exclusive access. Also, when the last exclusive claim is released, the holder/slave symlinks are removed automatically. * bd_claim/release() and close_bdev_exclusive() are no longer necessary and either made static or removed. * bd_link_disk_holder() remains the same but bd_unlink_disk_holder() is no longer necessary and removed. * open_bdev_exclusive() becomes a simple wrapper around lookup_bdev() and blkdev_get(). It also has an unexpected extra bdev_read_only() test which probably should be moved into blkdev_get(). * open_by_devnum() is modified to take @holder argument and pass it to blkdev_get(). Most of bdev open/close operations are unified into blkdev_get/put() and most exclusive accesses are tested atomically at the open time (as it should). This cleans up code and removes some, both valid and invalid, but unnecessary all the same, corner cases. open_bdev_exclusive() and open_by_devnum() can use further cleanup - rename to blkdev_get_by_path() and blkdev_get_by_devt() and drop special features. Well, let's leave them for another day. Most conversions are straight-forward. drbd conversion is a bit more involved as there was some reordering, but the logic should stay the same. Signed-off-by: Tejun Heo Acked-by: Neil Brown Acked-by: Ryusuke Konishi Acked-by: Mike Snitzer Acked-by: Philipp Reisner Cc: Peter Osterlund Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Jan Kara Cc: Andrew Morton Cc: Andreas Dilger Cc: "Theodore Ts'o" Cc: Mark Fasheh Cc: Joel Becker Cc: Alex Elder Cc: Christoph Hellwig Cc: dm-devel@redhat.com Cc: drbd-dev@lists.linbit.com Cc: Leo Chen Cc: Scott Branden Cc: Chris Mason Cc: Steven Whitehouse Cc: Dave Kleikamp Cc: Joern Engel Cc: reiserfs-devel@vger.kernel.org Cc: Alexander Viro --- include/linux/fs.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 66b7f2c5d7e9..1a033e8ebe4c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2006,7 +2006,8 @@ extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); -extern struct block_device *open_by_devnum(dev_t, fmode_t); +extern struct block_device *open_by_devnum(dev_t dev, fmode_t mode, + void *holder); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); extern struct super_block *freeze_bdev(struct block_device *); @@ -2037,22 +2038,16 @@ extern const struct file_operations def_fifo_fops; extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); -extern int blkdev_get(struct block_device *, fmode_t); -extern int blkdev_put(struct block_device *, fmode_t); -extern int bd_claim(struct block_device *, void *); -extern void bd_release(struct block_device *); +extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); +extern int blkdev_put(struct block_device *bdev, fmode_t mode); #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); -extern void bd_unlink_disk_holder(struct block_device *bdev); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { return 0; } -static inline void bd_unlink_disk_holder(struct block_device *bdev) -{ -} #endif #endif @@ -2089,7 +2084,6 @@ extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); extern struct block_device *lookup_bdev(const char *); extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *); -extern void close_bdev_exclusive(struct block_device *, fmode_t); extern void blkdev_show(struct seq_file *,off_t); #else -- cgit v1.2.3 From d4d77629953eabd3c14f6fa5746f6b28babfc55f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 13 Nov 2010 11:55:18 +0100 Subject: block: clean up blkdev_get() wrappers and their users After recent blkdev_get() modifications, open_by_devnum() and open_bdev_exclusive() are simple wrappers around blkdev_get(). Replace them with blkdev_get_by_dev() and blkdev_get_by_path(). blkdev_get_by_dev() is identical to open_by_devnum(). blkdev_get_by_path() is slightly different in that it doesn't automatically add %FMODE_EXCL to @mode. All users are converted. Most conversions are mechanical and don't introduce any behavior difference. There are several exceptions. * btrfs now sets FMODE_EXCL in btrfs_device->mode, so there's no reason to OR it explicitly on blkdev_put(). * gfs2, nilfs2 and the generic mount_bdev() now set FMODE_EXCL in sb->s_mode. * With the above changes, sb->s_mode now always should contain FMODE_EXCL. WARN_ON_ONCE() added to kill_block_super() to detect errors. The new blkdev_get_*() functions are with proper docbook comments. While at it, add function description to blkdev_get() too. Signed-off-by: Tejun Heo Cc: Philipp Reisner Cc: Neil Brown Cc: Mike Snitzer Cc: Joern Engel Cc: Chris Mason Cc: Jan Kara Cc: "Theodore Ts'o" Cc: KONISHI Ryusuke Cc: reiserfs-devel@vger.kernel.org Cc: xfs-masters@oss.sgi.com Cc: Alexander Viro --- include/linux/fs.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a033e8ebe4c..f48501563917 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2006,8 +2006,6 @@ extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); -extern struct block_device *open_by_devnum(dev_t dev, fmode_t mode, - void *holder); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); extern struct super_block *freeze_bdev(struct block_device *); @@ -2039,6 +2037,10 @@ extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); +extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder); +extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, + void *holder); extern int blkdev_put(struct block_device *bdev, fmode_t mode); #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); @@ -2083,7 +2085,6 @@ static inline void unregister_chrdev(unsigned int major, const char *name) extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); extern struct block_device *lookup_bdev(const char *); -extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *); extern void blkdev_show(struct seq_file *,off_t); #else -- cgit v1.2.3 From dddd9dc340ae1a41d90e084529ca979c77c4ecfe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Dec 2010 20:57:35 +0100 Subject: block: kill genhd_media_change_notify() There's no user of the facility. Kill it. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 7a7b9c1644e4..5e4e6928387c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -173,7 +173,6 @@ struct gendisk { struct timer_rand_state *random; atomic_t sync_io; /* RAID */ - struct work_struct async_notify; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; #endif -- cgit v1.2.3 From d2bf1b6723ed0eab378363649d15b7893bf14e91 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Dec 2010 20:57:36 +0100 Subject: block: move register_disk() and del_gendisk() to block/genhd.c There's no reason for register_disk() and del_gendisk() to be in fs/partitions/check.c. Move both to genhd.c. While at it, collapse unlink_gendisk(), which was artificially in a separate function due to genhd.c / check.c split, into del_gendisk(). Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - include/linux/genhd.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aae86fd10c4f..83031bcf8366 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -643,7 +643,6 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); -extern void register_disk(struct gendisk *dev); extern void generic_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 5e4e6928387c..56e17ed24816 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -394,7 +394,6 @@ extern void part_round_stats(int cpu, struct hd_struct *part); /* block/genhd.c */ extern void add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); -extern void unlink_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); -- cgit v1.2.3 From 77ea887e433ad8389d416826936c110fa7910f80 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Dec 2010 20:57:37 +0100 Subject: implement in-kernel gendisk events handling Currently, media presence polling for removeable block devices is done from userland. There are several issues with this. * Polling is done by periodically opening the device. For SCSI devices, the command sequence generated by such action involves a few different commands including TEST_UNIT_READY. This behavior, while perfectly legal, is different from Windows which only issues single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some ATAPI devices lock up after being periodically queried such command sequences. * There is no reliable and unintrusive way for a userland program to tell whether the target device is safe for media presence polling. For example, polling for media presence during an on-going burning session can make it fail. The polling program can avoid this by opening the device with O_EXCL but then it risks making a valid exclusive user of the device fail w/ -EBUSY. * Userland polling is unnecessarily heavy and in-kernel implementation is lighter and better coordinated (workqueue, timer slack). This patch implements framework for in-kernel disk event handling, which includes media presence polling. * bdops->check_events() is added, which supercedes ->media_changed(). It should check whether there's any pending event and return if so. Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be called parallelly. * gendisk->events and ->async_events are added. These should be initialized by block driver before passing the device to add_disk(). The former contains the mask of all supported events and the latter the mask of all events which the device can report without polling. /sys/block/*/events[_async] export these to userland. * Kernel parameter block.events_dfl_poll_msecs controls the system polling interval (default is 0 which means disable) and /sys/block/*/events_poll_msecs control polling intervals for individual devices (default is -1 meaning use system setting). Note that if a device can report all supported events asynchronously and its polling interval isn't explicitly set, the device won't be polled regardless of the system polling interval. * If a device is opened exclusively with write access, event checking is automatically disabled until all write exclusive accesses are released. * There are event 'clearing' events. For example, both of currently defined events are cleared after the device has been successfully opened. This information is passed to ->check_events() callback using @clearing argument as a hint. * Event checking is always performed from system_nrt_wq and timer slack is set to 25% for polling. * Nothing changes for drivers which implement ->media_changed() but not ->check_events(). Going forward, all drivers will be converted to ->check_events() and ->media_change() will be dropped. Signed-off-by: Tejun Heo Cc: Kay Sievers Cc: Jan Kara Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ include/linux/fs.h | 1 + include/linux/genhd.h | 18 +++++++++++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 83031bcf8366..05667e6989f1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1251,6 +1251,9 @@ struct block_device_operations { int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); + unsigned int (*check_events) (struct gendisk *disk, + unsigned int clearing); + /* ->media_changed() is DEPRECATED, use ->check_events() instead */ int (*media_changed) (struct gendisk *); void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); diff --git a/include/linux/fs.h b/include/linux/fs.h index f48501563917..997d22efdef0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -662,6 +662,7 @@ struct block_device { void * bd_claiming; void * bd_holder; int bd_holders; + bool bd_write_holder; #ifdef CONFIG_SYSFS struct gendisk * bd_holder_disk; /* for sysfs slave linkng */ #endif diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 56e17ed24816..13893aa2ac9d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -127,6 +127,11 @@ struct hd_struct { #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */ #define GENHD_FL_NATIVE_CAPACITY 128 +enum { + DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */ + DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */ +}; + #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) @@ -143,6 +148,8 @@ struct disk_part_tbl { struct hd_struct __rcu *part[]; }; +struct disk_events; + struct gendisk { /* major, first_minor and minors are input parameters only, * don't use directly. Use disk_devt() and disk_max_parts(). @@ -154,6 +161,10 @@ struct gendisk { char disk_name[DISK_NAME_LEN]; /* name of major driver */ char *(*devnode)(struct gendisk *gd, mode_t *mode); + + unsigned int events; /* supported events */ + unsigned int async_events; /* async events, subset of all */ + /* Array of pointers to partitions indexed by partno. * Protected with matching bdev lock but stat and other * non-critical accesses use RCU. Always access through @@ -171,8 +182,8 @@ struct gendisk { struct kobject *slave_dir; struct timer_rand_state *random; - atomic_t sync_io; /* RAID */ + struct disk_events *ev; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity *integrity; #endif @@ -405,6 +416,11 @@ static inline int get_disk_ro(struct gendisk *disk) return disk->part0.policy; } +extern void disk_block_events(struct gendisk *disk); +extern void disk_unblock_events(struct gendisk *disk); +extern void disk_check_events(struct gendisk *disk); +extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); + /* drivers/char/random.c */ extern void add_disk_randomness(struct gendisk *disk); extern void rand_initialize_disk(struct gendisk *disk); -- cgit v1.2.3 From 2d9217296bfa6fdc0d3707264076e5296faffdbd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Dec 2010 20:57:38 +0100 Subject: cdrom: add ->check_events() support In principle, cdrom just needs to pass through ->check_events() but CDROM_MEDIA_CHANGED ioctl makes things a bit more complex. Just as with ->media_changed() support, cdrom code needs to buffer the events and serve them to ioctl and vfs as requested. As the code has to deal with both ->check_events() and ->media_changed(), and vfs and ioctl event buffering, this patch adds check_events caching on top of the existing cdi->mc_flags buffering. It may be a good idea to deprecate CDROM_MEDIA_CHANGED ioctl and remove all this mess. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/cdrom.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 78e904796622..35eae4b67503 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -946,6 +946,8 @@ struct cdrom_device_info { /* device-related storage */ unsigned int options : 30; /* options flags */ unsigned mc_flags : 2; /* media change buffer flags */ + unsigned int vfs_events; /* cached events for vfs path */ + unsigned int ioctl_events; /* cached events for ioctl path */ int use_count; /* number of times device opened */ char name[20]; /* name of the device type */ /* per-device flags */ @@ -965,6 +967,8 @@ struct cdrom_device_ops { int (*open) (struct cdrom_device_info *, int); void (*release) (struct cdrom_device_info *); int (*drive_status) (struct cdrom_device_info *, int); + unsigned int (*check_events) (struct cdrom_device_info *cdi, + unsigned int clearing, int slot); int (*media_changed) (struct cdrom_device_info *, int); int (*tray_move) (struct cdrom_device_info *, int); int (*lock_door) (struct cdrom_device_info *, int); @@ -993,6 +997,8 @@ extern int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, extern void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode); extern int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); +extern unsigned int cdrom_check_events(struct cdrom_device_info *cdi, + unsigned int clearing); extern int cdrom_media_changed(struct cdrom_device_info *); extern int register_cdrom(struct cdrom_device_info *cdi); -- cgit v1.2.3