From 66ba32dc167202c3cf8c86806581a9393ec7f488 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 18 Sep 2012 12:19:29 -0400 Subject: block: ioctl to zero block ranges Introduce a BLKZEROOUT ioctl which can be used to clear block ranges by way of blkdev_issue_zeroout(). Signed-off-by: Martin K. Petersen Acked-by: Mike Snitzer Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index aa110476a95b..bd6f6e7ca48e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -335,6 +335,7 @@ struct inodes_stat_t { #define BLKDISCARDZEROES _IO(0x12,124) #define BLKSECDISCARD _IO(0x12,125) #define BLKROTATIONAL _IO(0x12,126) +#define BLKZEROOUT _IO(0x12,127) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3 From b87570f5d349661814b262dd5fc40787700f80d6 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 26 Sep 2012 07:46:40 +0200 Subject: Fix a crash when block device is read and block size is changed at the same time The kernel may crash when block size is changed and I/O is issued simultaneously. Because some subsystems (udev or lvm) may read any block device anytime, the bug actually puts any code that changes a block device size in jeopardy. The crash can be reproduced if you place "msleep(1000)" to blkdev_get_blocks just before "bh->b_size = max_blocks << inode->i_blkbits;". Then, run "dd if=/dev/ram0 of=/dev/null bs=4k count=1 iflag=direct" While it is waiting in msleep, run "blockdev --setbsz 2048 /dev/ram0" You get a BUG. The direct and non-direct I/O is written with the assumption that block size does not change. It doesn't seem practical to fix these crashes one-by-one there may be many crash possibilities when block size changes at a certain place and it is impossible to find them all and verify the code. This patch introduces a new rw-lock bd_block_size_semaphore. The lock is taken for read during I/O. It is taken for write when changing block size. Consequently, block size can't be changed while I/O is being submitted. For asynchronous I/O, the patch only prevents block size change while the I/O is being submitted. The block size can change when the I/O is in progress or when the I/O is being finished. This is acceptable because there are no accesses to block size when asynchronous I/O is being finished. The patch prevents block size changing while the device is mapped with mmap. Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index bd6f6e7ca48e..e60bbd0225d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -725,6 +725,8 @@ struct block_device { int bd_fsfreeze_count; /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; + /* A semaphore that prevents I/O while block size is being changed */ + struct rw_semaphore bd_block_size_semaphore; }; /* @@ -2565,6 +2567,8 @@ extern int generic_segment_checks(const struct iovec *iov, unsigned long *nr_segs, size_t *count, int access_flags); /* fs/block_dev.c */ +extern ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, -- cgit v1.2.3 From 62ac665ff9fc07497ca524bd20d6a96893d11071 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 26 Sep 2012 07:46:43 +0200 Subject: blockdev: turn a rw semaphore into a percpu rw semaphore This avoids cache line bouncing when many processes lock the semaphore for read. New percpu lock implementation The lock consists of an array of percpu unsigned integers, a boolean variable and a mutex. When we take the lock for read, we enter rcu read section, check for a "locked" variable. If it is false, we increase a percpu counter on the current cpu and exit the rcu section. If "locked" is true, we exit the rcu section, take the mutex and drop it (this waits until a writer finished) and retry. Unlocking for read just decreases percpu variable. Note that we can unlock on a difference cpu than where we locked, in this case the counter underflows. The sum of all percpu counters represents the number of processes that hold the lock for read. When we need to lock for write, we take the mutex, set "locked" variable to true and synchronize rcu. Since RCU has been synchronized, no processes can create new read locks. We wait until the sum of percpu counters is zero - when it is, there are no readers in the critical section. Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e60bbd0225d5..24e1229cdfe0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -10,6 +10,7 @@ #include #include #include +#include /* * It's silly to have NR_OPEN bigger than NR_FILE, but you can change @@ -726,7 +727,7 @@ struct block_device { /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; /* A semaphore that prevents I/O while block size is being changed */ - struct rw_semaphore bd_block_size_semaphore; + struct percpu_rw_semaphore bd_block_size_semaphore; }; /* -- cgit v1.2.3 From c2b1ad800b66f62105a7fd250604d72e07202e66 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Sep 2012 09:35:03 +0200 Subject: fs: fix include/percpu-rwsem.h export error We get the following export error on the include file: usr/include/linux/fs.h:13: included file 'linux/percpu-rwsem.h' is not exported Move the include inside the __KERNEL__ section. Reported-by: Stephen Rothwell Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 24e1229cdfe0..cefa9645fc85 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -10,7 +10,6 @@ #include #include #include -#include /* * It's silly to have NR_OPEN bigger than NR_FILE, but you can change @@ -417,6 +416,7 @@ struct inodes_stat_t { #include #include #include +#include #include -- cgit v1.2.3