From a5166169f9b920cae3c503910cb66a3ac5dd846d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 12 Dec 2011 22:53:00 -0500 Subject: vfs: convert fs_supers to hlist Signed-off-by: Al Viro --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e0bc4ffb8e7f..ed17e54fd204 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1440,7 +1440,7 @@ struct super_block { struct block_device *s_bdev; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; - struct list_head s_instances; + struct hlist_node s_instances; struct quota_info s_dquot; /* Diskquota specific options */ int s_frozen; @@ -1864,7 +1864,7 @@ struct file_system_type { void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; - struct list_head fs_supers; + struct hlist_head fs_supers; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; -- cgit v1.2.3 From cf31e70d6cf93f19fe9bf1144966ef40991ac723 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 2 Jan 2012 22:28:36 -0500 Subject: vfs: new helper - vfs_ustat() ... and bury user_get_super()/statfs_by_dentry() - they are purely internal now. 
Signed-off-by: Al Viro --- include/linux/fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ed17e54fd204..cec429d76ab0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1939,7 +1939,7 @@ extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, extern int vfs_statfs(struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); -extern int statfs_by_dentry(struct dentry *, struct kstatfs *); +extern int vfs_ustat(dev_t, struct kstatfs *); extern int freeze_super(struct super_block *super); extern int thaw_super(struct super_block *super); extern bool our_mnt(struct vfsmount *mnt); @@ -2531,7 +2531,6 @@ extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern struct super_block *get_super(struct block_device *); extern struct super_block *get_active_super(struct block_device *bdev); -extern struct super_block *user_get_super(dev_t); extern void drop_super(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); extern void iterate_supers_type(struct file_system_type *, -- cgit v1.2.3 From ff01bb4832651c6d25ac509a06a10fcbd75c461c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 16 Sep 2011 02:31:11 -0400 Subject: fs: move code out of buffer.c Move invalidate_bdev, block_sync_page into fs/block_dev.c. Export kill_bdev as well, so brd doesn't have to open code it. Reduce buffer_head.h requirement accordingly. Removed a rather large comment from invalidate_bdev, as it looked a bit obsolete to bother moving. The small comment replacing it says enough. 
Signed-off-by: Nick Piggin Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index cec429d76ab0..e853ba5eddd4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2092,6 +2092,7 @@ extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); +extern void kill_bdev(struct block_device *); extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); @@ -2099,6 +2100,7 @@ extern int fsync_bdev(struct block_device *); #else static inline void bd_forget(struct inode *inode) {} static inline int sync_blockdev(struct block_device *bdev) { return 0; } +static inline void kill_bdev(struct block_device *bdev) {} static inline void invalidate_bdev(struct block_device *bdev) {} static inline struct super_block *freeze_bdev(struct block_device *sb) @@ -2415,6 +2417,7 @@ extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync); +extern void block_sync_page(struct page *page); /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, -- cgit v1.2.3 From 18bb1db3e7607e4a997d50991a6f9fa5b0f8722c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:41:39 -0400 Subject: switch vfs_mkdir() and ->mkdir() to umode_t vfs_mkdir() gets int, but immediately drops everything that might not fit into umode_t and that's the only caller of ->mkdir()... 
Signed-off-by: Al Viro --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index cec429d76ab0..3f7bd8b12e37 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1517,7 +1517,7 @@ extern void unlock_super(struct super_block *); * VFS helper functions.. */ extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); -extern int vfs_mkdir(struct inode *, struct dentry *, int); +extern int vfs_mkdir(struct inode *, struct dentry *, umode_t); extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); extern int vfs_symlink(struct inode *, struct dentry *, const char *); extern int vfs_link(struct dentry *, struct inode *, struct dentry *); @@ -1623,7 +1623,7 @@ struct inode_operations { int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct inode *,struct dentry *,const char *); - int (*mkdir) (struct inode *,struct dentry *,int); + int (*mkdir) (struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct inode *,struct dentry *,int,dev_t); int (*rename) (struct inode *, struct dentry *, -- cgit v1.2.3 From 4acdaf27ebe2034c342f3be57ef49aed1ad885ef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:42:34 -0400 Subject: switch ->create() to umode_t vfs_create() ignores everything outside of 16bit subset of its mode argument; switching it to umode_t is obviously equivalent and it's the only caller of the method Signed-off-by: Al Viro --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3f7bd8b12e37..e40321a6e239 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1516,7 +1516,7 @@ extern void unlock_super(struct super_block *); /* * VFS helper functions.. 
*/ -extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); +extern int vfs_create(struct inode *, struct dentry *, umode_t, struct nameidata *); extern int vfs_mkdir(struct inode *, struct dentry *, umode_t); extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); extern int vfs_symlink(struct inode *, struct dentry *, const char *); @@ -1619,7 +1619,7 @@ struct inode_operations { int (*readlink) (struct dentry *, char __user *,int); void (*put_link) (struct dentry *, struct nameidata *, void *); - int (*create) (struct inode *,struct dentry *,int, struct nameidata *); + int (*create) (struct inode *,struct dentry *,umode_t,struct nameidata *); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct inode *,struct dentry *,const char *); -- cgit v1.2.3 From 1a67aafb5f72a436ca044293309fa7e6351d6a35 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 26 Jul 2011 01:52:52 -0400 Subject: switch ->mknod() to umode_t Signed-off-by: Al Viro --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e40321a6e239..b89eef1d1752 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1518,7 +1518,7 @@ extern void unlock_super(struct super_block *); */ extern int vfs_create(struct inode *, struct dentry *, umode_t, struct nameidata *); extern int vfs_mkdir(struct inode *, struct dentry *, umode_t); -extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); +extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); extern int vfs_symlink(struct inode *, struct dentry *, const char *); extern int vfs_link(struct dentry *, struct inode *, struct dentry *); extern int vfs_rmdir(struct inode *, struct dentry *); @@ -1625,7 +1625,7 @@ struct inode_operations { int (*symlink) (struct inode *,struct dentry *,const char *); int (*mkdir) 
(struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); - int (*mknod) (struct inode *,struct dentry *,int,dev_t); + int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); void (*truncate) (struct inode *); -- cgit v1.2.3 From 62bb109170375f82eb3c51c8080b72954f02dca7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Jul 2011 23:20:18 -0400 Subject: switch inode_init_owner() to umode_t Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index b89eef1d1752..9db9f6e6c98b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1534,7 +1534,7 @@ extern void dentry_unhash(struct dentry *dentry); * VFS file helper functions. */ extern void inode_init_owner(struct inode *inode, const struct inode *dir, - mode_t mode); + umode_t mode); /* * VFS FS_IOC_FIEMAP helper definitions. 
*/ -- cgit v1.2.3 From 8d334acdd2c1f57c7a574c6f24d08e4c95582ff0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Jul 2011 23:21:59 -0400 Subject: switch is_sxid() to umode_t Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9db9f6e6c98b..9d02fab420c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2690,7 +2690,7 @@ int __init get_filesystem_list(char *buf); #define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \ (flag & __FMODE_NONOTIFY))) -static inline int is_sxid(mode_t mode) +static inline int is_sxid(umode_t mode) { return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); } -- cgit v1.2.3 From a218d0fdc5f9004164ff151d274487f6799907d0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Nov 2011 14:59:34 -0500 Subject: switch open and mkdir syscalls to umode_t Signed-off-by: Al Viro --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9d02fab420c6..f0e57b7e4297 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2054,8 +2054,8 @@ extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, extern int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern long do_sys_open(int dfd, const char __user *filename, int flags, - int mode); -extern struct file *filp_open(const char *, int, int); + umode_t mode); +extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, -- cgit v1.2.3 From 64132379d509184425672e0dce1ac0a031e3f2a5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Dec 2011 20:51:13 -0500 Subject: vfs: switch ->show_stats to struct dentry * 
Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 659be7d82617..b2e4b6f639e4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1675,7 +1675,7 @@ struct super_operations { int (*show_options)(struct seq_file *, struct vfsmount *); int (*show_devname)(struct seq_file *, struct vfsmount *); int (*show_path)(struct seq_file *, struct vfsmount *); - int (*show_stats)(struct seq_file *, struct vfsmount *); + int (*show_stats)(struct seq_file *, struct dentry *); #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); -- cgit v1.2.3 From d861c630e99febe5ce6055290085556c5b714b06 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Dec 2011 21:32:45 -0500 Subject: vfs: switch ->show_devname() to struct dentry * Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index b2e4b6f639e4..a8dff43d1b9d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1673,7 +1673,7 @@ struct super_operations { void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct vfsmount *); - int (*show_devname)(struct seq_file *, struct vfsmount *); + int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct vfsmount *); int (*show_stats)(struct seq_file *, struct dentry *); #ifdef CONFIG_QUOTA -- cgit v1.2.3 From a6322de67b58a00e3a783ad9c87c2a11b2d67b47 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Dec 2011 21:37:57 -0500 Subject: vfs: switch ->show_path() to struct dentry * Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git 
a/include/linux/fs.h b/include/linux/fs.h index a8dff43d1b9d..13721b073407 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1674,7 +1674,7 @@ struct super_operations { int (*show_options)(struct seq_file *, struct vfsmount *); int (*show_devname)(struct seq_file *, struct dentry *); - int (*show_path)(struct seq_file *, struct vfsmount *); + int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct seq_file *, struct dentry *); #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); -- cgit v1.2.3 From 34c80b1d93e6e20ca9dea0baf583a5b5510d92d4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Dec 2011 21:32:45 -0500 Subject: vfs: switch ->show_options() to struct dentry * Signed-off-by: Al Viro --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 13721b073407..cc1021fd19ef 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1672,7 +1672,7 @@ struct super_operations { int (*remount_fs) (struct super_block *, int *, char *); void (*umount_begin) (struct super_block *); - int (*show_options)(struct seq_file *, struct vfsmount *); + int (*show_options)(struct seq_file *, struct dentry *); int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct seq_file *, struct dentry *); @@ -2592,7 +2592,7 @@ extern void setattr_copy(struct inode *inode, const struct iattr *attr); extern void file_update_time(struct file *file); -extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt); +extern int generic_show_options(struct seq_file *m, struct dentry *root); extern void save_mount_options(struct super_block *sb, char *options); extern void replace_mount_options(struct super_block *sb, char *options); -- cgit v1.2.3 From 39f7c4db1d2d9e2e2a90abdf34811783089d217d Mon Sep 17 00:00:00 2001 
From: Miklos Szeredi Date: Mon, 21 Nov 2011 12:11:30 +0100 Subject: vfs: keep list of mounts for each superblock Keep track of vfsmounts belonging to a superblock. List is protected by vfsmount_lock. Signed-off-by: Miklos Szeredi Tested-by: Toshiyuki Okajima Signed-off-by: Al Viro --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index cc1021fd19ef..03385acd71e8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1428,6 +1428,7 @@ struct super_block { #else struct list_head s_files; #endif + struct list_head s_mounts; /* list of mounts; _not_ for fs use */ /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */ struct list_head s_dentry_lru; /* unused dentry lru */ int s_nr_dentry_unused; /* # of dentry on lru */ -- cgit v1.2.3 From 4ed5e82fe77f4147cf386327c9a63a2dd7eff518 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 Nov 2011 12:11:31 +0100 Subject: vfs: protect remounting superblock read-only Currently remounting superblock read-only is racy in a major way. With the per mount read-only infrastructure it is now possible to prevent most races, which this patch attempts. Before starting the remount read-only, iterate through all mounts belonging to the superblock and if none of them have any pending writes, set sb->s_readonly_remount. This indicates that remount is in progress and no further write requests are allowed. If the remount succeeds set MS_RDONLY and reset s_readonly_remount. If the remounting is unsuccessful just reset s_readonly_remount. This can result in transient EROFS errors, despite the fact the remount failed. Unfortunately holding off writes is difficult as remount itself may touch the filesystem (e.g. through load_nls()) which would deadlock. A later patch deals with delayed writes due to nlink going to zero.
Signed-off-by: Miklos Szeredi Tested-by: Toshiyuki Okajima Signed-off-by: Al Viro --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 03385acd71e8..7b8a681b1ef4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1482,6 +1482,9 @@ struct super_block { int cleancache_poolid; struct shrinker s_shrink; /* per-sb shrinker handle */ + + /* Being remounted read-only */ + int s_readonly_remount; }; /* superblock cache pruning functions */ -- cgit v1.2.3 From 7ada4db88634429f4da690ad1c4eb73c93085f0c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 Nov 2011 12:11:32 +0100 Subject: vfs: count unlinked inodes Add a new counter to the superblock that keeps track of unlinked but not yet deleted inodes. Do not WARN_ON if set_nlink is called with zero count, just do a ratelimited printk. This happens on xfs and probably other filesystems after an unclean shutdown when the filesystem reads inodes which already have zero i_nlink. Reported by Christoph Hellwig. 
Signed-off-by: Miklos Szeredi Tested-by: Toshiyuki Okajima Signed-off-by: Al Viro --- include/linux/fs.h | 61 +++++++----------------------------------------------- 1 file changed, 7 insertions(+), 54 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7b8a681b1ef4..8ac40921f5ac 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1483,6 +1483,9 @@ struct super_block { struct shrinker s_shrink; /* per-sb shrinker handle */ + /* Number of inodes with nlink == 0 but still referenced */ + atomic_long_t s_remove_count; + /* Being remounted read-only */ int s_readonly_remount; }; @@ -1768,31 +1771,10 @@ static inline void mark_inode_dirty_sync(struct inode *inode) __mark_inode_dirty(inode, I_DIRTY_SYNC); } -/** - * set_nlink - directly set an inode's link count - * @inode: inode - * @nlink: new nlink (should be non-zero) - * - * This is a low-level filesystem helper to replace any - * direct filesystem manipulation of i_nlink. - */ -static inline void set_nlink(struct inode *inode, unsigned int nlink) -{ - inode->__i_nlink = nlink; -} - -/** - * inc_nlink - directly increment an inode's link count - * @inode: inode - * - * This is a low-level filesystem helper to replace any - * direct filesystem manipulation of i_nlink. Currently, - * it is only here for parity with dec_nlink(). 
- */ -static inline void inc_nlink(struct inode *inode) -{ - inode->__i_nlink++; -} +extern void inc_nlink(struct inode *inode); +extern void drop_nlink(struct inode *inode); +extern void clear_nlink(struct inode *inode); +extern void set_nlink(struct inode *inode, unsigned int nlink); static inline void inode_inc_link_count(struct inode *inode) { @@ -1800,35 +1782,6 @@ static inline void inode_inc_link_count(struct inode *inode) mark_inode_dirty(inode); } -/** - * drop_nlink - directly drop an inode's link count - * @inode: inode - * - * This is a low-level filesystem helper to replace any - * direct filesystem manipulation of i_nlink. In cases - * where we are attempting to track writes to the - * filesystem, a decrement to zero means an imminent - * write when the file is truncated and actually unlinked - * on the filesystem. - */ -static inline void drop_nlink(struct inode *inode) -{ - inode->__i_nlink--; -} - -/** - * clear_nlink - directly zero an inode's link count - * @inode: inode - * - * This is a low-level filesystem helper to replace any - * direct filesystem manipulation of i_nlink. See - * drop_nlink() for why we care about i_nlink hitting zero. - */ -static inline void clear_nlink(struct inode *inode) -{ - inode->__i_nlink = 0; -} - static inline void inode_dec_link_count(struct inode *inode) { drop_nlink(inode); -- cgit v1.2.3 From 8e8b87964bc8dc5c14b6543fc933b7725f07d3ac Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 21 Nov 2011 12:11:33 +0100 Subject: vfs: prevent remount read-only if pending removes If there are any inodes on the super block that have been unlinked (i_nlink == 0) but have not yet been deleted then prevent the remounting the super block read-only. 
Reported-by: Toshiyuki Okajima Signed-off-by: Miklos Szeredi Tested-by: Toshiyuki Okajima Signed-off-by: Al Viro --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8ac40921f5ac..7aacf31418fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2150,8 +2150,6 @@ extern const struct file_operations read_pipefifo_fops; extern const struct file_operations write_pipefifo_fops; extern const struct file_operations rdwr_pipefifo_fops; -extern int fs_may_remount_ro(struct super_block *); - #ifdef CONFIG_BLOCK /* * return READ, READA, or WRITE -- cgit v1.2.3 From ef00f59c95fe6e002e7c6e3663cdea65e253f4cc Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 11 Jan 2012 16:29:31 +0100 Subject: block: Add BLKROTATIONAL ioctl Introduce an ioctl which permits applications to query whether a block device is rotational. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e0bc4ffb8e7f..95dd911506f1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -319,6 +319,7 @@ struct inodes_stat_t { #define BLKPBSZGET _IO(0x12,123) #define BLKDISCARDZEROES _IO(0x12,124) #define BLKSECDISCARD _IO(0x12,125) +#define BLKROTATIONAL _IO(0x12,126) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3 From 28d82dc1c4edbc352129f97f4ca22624d1fe61de Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 12 Jan 2012 17:17:43 -0800 Subject: epoll: limit paths The current epoll code can be tickled to run basically indefinitely in both loop detection path check (on ep_insert()), and in the wakeup paths. The programs that tickle this behavior set up deeply linked networks of epoll file descriptors that cause the epoll algorithms to traverse them indefinitely. 
A couple of these sample programs have been previously posted in this thread: https://lkml.org/lkml/2011/2/25/297. To fix the loop detection path check algorithms, I simply keep track of the epoll nodes that have been already visited. Thus, the loop detection becomes proportional to the number of epoll file descriptors and links. This dramatically decreases the run-time of the loop check algorithm. In one diabolical case I tried it reduced the run-time from 15 minutes (all in kernel time) to .3 seconds. Fixing the wakeup paths could be done at wakeup time in a similar manner by keeping track of nodes that have already been visited, but the complexity is harder, since there can be multiple wakeups on different cpus... Thus, I've opted to limit the number of possible wakeup paths when the paths are created. This is accomplished, by noting that the end file descriptor points that are found during the loop detection pass (from the newly added link), are actually the sources for wakeup events. I keep a list of these file descriptors and limit the number and length of these paths that emanate from these 'source file descriptors'. In the current implementation I allow 1000 paths of length 1, 500 of length 2, 100 of length 3, 50 of length 4 and 10 of length 5. Note that it is sufficient to check the 'source file descriptors' reachable from the newly added link, since no other 'source file descriptors' will have newly added links. This allows us to check only the wakeup paths that may have gotten too long, and not re-check all possible wakeup paths on the system. In terms of the path limit selection, I think it's first worth noting that the most common case for epoll, is probably the model where you have 1 epoll file descriptor that is monitoring n number of 'source file descriptors'. In this case, each 'source file descriptor' has 1 path of length 1. Thus, I believe that the limits I'm proposing are quite reasonable and in fact may be too generous.
Thus, I'm hoping that the proposed limits will not cause any workloads that currently work to fail. In terms of locking, I have extended the use of the 'epmutex' to all epoll_ctl add and remove operations. Currently it's only used in a subset of the add paths. I need to hold the epmutex, so that we can correctly traverse a coherent graph, to check the number of paths. I believe that this additional locking is probably ok, since it's in the setup/teardown paths, and doesn't affect the running paths, but it certainly is going to add some extra overhead. Also, worth noting is that the epmutex was recently added to the ep_ctl add operations in the initial path loop detection code using the argument that it was not on a critical path. Another thing to note here, is the length of epoll chains that is allowed. Currently, eventpoll.c defines: /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 This basically means that I am limited to a graph depth of 5 (EP_MAX_NESTS + 1). However, this limit is currently only enforced during the loop check detection code, and only when the epoll file descriptors are added in a certain order. Thus, this limit is currently easily bypassed. The newly added check for wakeup paths, strictly limits the wakeup paths to a length of 5, regardless of the order in which ep's are linked together. Thus, a side-effect of the new code is a more consistent enforcement of the graph depth. Thus far, I've tested this, using the sample programs previously mentioned, which now either return quickly or return -EINVAL. I've also tested using the piptest.c epoll tester, which showed no difference in performance. I've also created a number of different epoll networks and tested that they behave as expected. I believe this solves the original diabolical test cases, while still preserving the sane epoll nesting.
Signed-off-by: Jason Baron Cc: Nelson Elhage Cc: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7aacf31418fe..a7409bc157e0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1001,6 +1001,7 @@ struct file { #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ struct list_head f_ep_links; + struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; #ifdef CONFIG_DEBUG_WRITECOUNT -- cgit v1.2.3 From b969c4ab9f182a6e1b2a0848be349f99714947b0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:34 -0800 Subject: mm: compaction: determine if dirty pages can be migrated without blocking within ->migratepage Asynchronous compaction is used when allocating transparent hugepages to avoid blocking for long periods of time. Due to reports of stalling, there was a debate on disabling synchronous compaction but this severely impacted allocation success rates. Part of the reason was that many dirty pages are skipped in asynchronous compaction by the following check; if (PageDirty(page) && !sync && mapping->a_ops->migratepage != migrate_page) rc = -EBUSY; This skips over all mapping aops using buffer_migrate_page() even though it is possible to migrate some of these pages without blocking. This patch updates the ->migratepage callback with a "sync" parameter. It is the responsibility of the callback to fail gracefully if migration would block. 
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index a7409bc157e0..b92b73d0b2b9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -609,9 +609,12 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **, unsigned long *); - /* migrate the contents of a page to the specified target */ + /* + * migrate the contents of a page to the specified target. If sync + * is false, it must not block. + */ int (*migratepage) (struct address_space *, - struct page *, struct page *); + struct page *, struct page *, bool); int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); @@ -2537,7 +2540,7 @@ extern int generic_check_addressable(unsigned, u64); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, bool); #else #define buffer_migrate_page NULL #endif -- cgit v1.2.3 From a6bc32b899223a877f595ef9ddc1e89ead5072b8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Jan 2012 17:19:43 -0800 Subject: mm: compaction: introduce sync-light migration for use by compaction This patch adds a lightweight sync migrate operation MIGRATE_SYNC_LIGHT mode that avoids writing back pages to backing storage. Async compaction maps to MIGRATE_ASYNC while sync compaction maps to MIGRATE_SYNC_LIGHT. For other migrate_pages users such as memory hotplug, MIGRATE_SYNC is used. 
This avoids sync compaction stalling for an excessive length of time, particularly when copying files to a USB stick where there might be a large number of dirty pages backed by a filesystem that does not support ->writepages. [aarcange@redhat.com: This patch is heavily based on Andrea's work] [akpm@linux-foundation.org: fix fs/nfs/write.c build] [akpm@linux-foundation.org: fix fs/btrfs/disk-io.c build] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Dave Jones Cc: Jan Kara Cc: Andy Isaacson Cc: Nai Xia Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index b92b73d0b2b9..e694bd4434a4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -525,6 +525,7 @@ enum positive_aop_returns { struct page; struct address_space; struct writeback_control; +enum migrate_mode; struct iov_iter { const struct iovec *iov; @@ -614,7 +615,7 @@ struct address_space_operations { * is false, it must not block. */ int (*migratepage) (struct address_space *, - struct page *, struct page *, bool); + struct page *, struct page *, enum migrate_mode); int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); @@ -2540,7 +2541,8 @@ extern int generic_check_addressable(unsigned, u64); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, - struct page *, struct page *, bool); + struct page *, struct page *, + enum migrate_mode); #else #define buffer_migrate_page NULL #endif -- cgit v1.2.3 From 87192a2a49c475cf322cb143e0fa63b0102d8567 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 12 Jan 2012 17:20:34 -0800 Subject: vfs: cache request_queue in struct block_device This makes it possible to get from the inode to the request_queue with one less cache miss. 
Used in follow-on optimization. The lifetime of the pointer is the same as the gendisk. This assumes that the queue will always stay the same in the gendisk while it's visible to block_devices. I think that's safe, correct? Signed-off-by: Andi Kleen Acked-by: Jeff Moyer Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e694bd4434a4..4bc8169fb5a1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -660,6 +660,7 @@ struct address_space { * must be enforced here for CRIS, to let the least significant bit * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. */ +struct request_queue; struct block_device { dev_t bd_dev; /* not a kdev_t - it's a search key */ @@ -682,6 +683,7 @@ struct block_device { unsigned bd_part_count; int bd_invalidated; struct gendisk * bd_disk; + struct request_queue * bd_queue; struct list_head bd_list; /* * Private data. You must have bd_claim'ed the block_device -- cgit v1.2.3