From e108526e77aa41c89b3be96f75d97615db2b751c Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 23 Jul 2008 21:26:44 -0700 Subject: move memory_read_from_buffer() from fs.h to string.h James Bottomley warns that inclusion of linux/fs.h in a low level driver was always a danger signal. This patch moves memory_read_from_buffer() from fs.h to string.h and fixes includes in existing memory_read_from_buffer() users. Signed-off-by: Akinobu Mita Cc: James Bottomley Cc: Geert Uytterhoeven Cc: Zhang Rui Cc: Bob Moore Cc: Thomas Renninger Cc: Len Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9c2ac5c0ef5c..ff54ae4933f3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2006,8 +2006,6 @@ extern void simple_release_fs(struct vfsmount **mount, int *count); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); -extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, - const void *from, size_t available); #ifdef CONFIG_MIGRATION extern int buffer_migrate_page(struct address_space *, -- cgit v1.2.3 From da3bbdd4632c0171406b2677e31494afa5bde2f8 Mon Sep 17 00:00:00 2001 From: Kentaro Makita Date: Wed, 23 Jul 2008 21:27:13 -0700 Subject: fix soft lock up at NFS mount via per-SB LRU-list of unused dentries [Summary] Split LRU-list of unused dentries to one per superblock to avoid soft lock up during NFS mounts and remounting of any filesystem. Previously I posted here: http://lkml.org/lkml/2008/3/5/590 [Descriptions] - background dentry_unused is a list of dentries which are not referenced. dentry_unused grows up when references on directories or files are released. This list can be very long if there is huge free memory. - the problem When shrink_dcache_sb() is called, it scans all dentry_unused linearly under spin_lock(), and if dentry->d_sb is differnt from given superblock, scan next dentry. This scan costs very much if there are many entries, and very ineffective if there are many superblocks. IOW, When we need to shrink unused dentries on one dentry, but scans unused dentries on all superblocks in the system. For example, we scan 500 dentries to unmount a filesystem, but scans 1,000,000 or more unused dentries on other superblocks. In our case , At mounting NFS*, shrink_dcache_sb() is called to shrink unused dentries on NFS, but scans 100,000,000 unused dentries on superblocks in the system such as local ext3 filesystems. I hear NFS mounting took 1 min on some system in use. * : NFS uses virtual filesystem in rpc layer, so NFS is affected by this problem. 100,000,000 is possible number on large systems. Per-superblock LRU of unused dentried can reduce the cost in reasonable manner. - How to fix I found this problem is solved by David Chinner's "Per-superblock unused dentry LRU lists V3"(1), so I rebase it and add some fix to reclaim with fairness, which is in Andrew Morton's comments(2). 1) http://lkml.org/lkml/2006/5/25/318 2) http://lkml.org/lkml/2006/5/25/320 Split LRU-list of unused dentries to each superblocks. Then, NFS mounting will check dentries under a superblock instead of all. But this spliting will break LRU of dentry-unused. So, I've attempted to make reclaim unused dentrins with fairness by calculate number of dentries to scan on this sb based on following way number of dentries to scan on this sb = count * (number of dentries on this sb / number of dentries in the machine) - ToDo - I have to measuring performance number and do stress tests. - When unmount occurs during prune_dcache(), scanning on same superblock, It is unable to reach next superblock because it is gone away. We restart scannig superblock from first one, it causes unfairness of reclaim unused dentries on first superblock. But I think this happens very rarely. - Test Results Result on 6GB boxes with excessive unused dentries. Without patch: $ cat /proc/sys/fs/dentry-state 10181835 10180203 45 0 0 0 # mount -t nfs 10.124.60.70:/work/kernel-src nfs real 0m1.830s user 0m0.001s sys 0m1.653s With this patch: $ cat /proc/sys/fs/dentry-state 10236610 10234751 45 0 0 0 # mount -t nfs 10.124.60.70:/work/kernel-src nfs real 0m0.106s user 0m0.002s sys 0m0.032s [akpm@linux-foundation.org: fix comments] Signed-off-by: Kentaro Makita Cc: Neil Brown Cc: Trond Myklebust Cc: David Chinner Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index ff54ae4933f3..e5e6a244096c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1025,6 +1025,7 @@ extern int send_sigurg(struct fown_struct *fown); extern struct list_head super_blocks; extern spinlock_t sb_lock; +#define sb_entry(list) list_entry((list), struct super_block, s_list) #define S_BIAS (1<<30) struct super_block { struct list_head s_list; /* Keep this first */ @@ -1058,6 +1059,9 @@ struct super_block { struct list_head s_more_io; /* parked for more writeback */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_files; + /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ + struct list_head s_dentry_lru; /* unused dentry lru */ + int s_nr_dentry_unused; /* # of dentry on lru */ struct block_device *s_bdev; struct mtd_info *s_mtd; -- cgit v1.2.3 From ed8cae8ba01348bfd83333f4648dd807b04d7f08 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:30 -0700 Subject: flag parameters: pipe This patch introduces the new syscall pipe2 which is like pipe but it also takes an additional parameter which takes a flag value. This patch implements the handling of O_CLOEXEC for the flag. I did not add support for the new syscall for the architectures which have a special sys_pipe implementation. I think the maintainers of those archs have the chance to go with the unified implementation but that's up to them. The implementation introduces do_pipe_flags. I did that instead of changing all callers of do_pipe because some of the callers are written in assembler. I would probably screw up changing the assembly code. To avoid breaking code do_pipe is now a small wrapper around do_pipe_flags. Once all callers are changed over to do_pipe_flags the old do_pipe function can be removed. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_pipe2 # ifdef __x86_64__ # define __NR_pipe2 293 # elif defined __i386__ # define __NR_pipe2 331 # else # error "need __NR_pipe2" # endif #endif int main (void) { int fd[2]; if (syscall (__NR_pipe2, fd, 0) != 0) { puts ("pipe2(0) failed"); return 1; } for (int i = 0; i < 2; ++i) { int coe = fcntl (fd[i], F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { printf ("pipe2(0) set close-on-exit for fd[%d]\n", i); return 1; } } close (fd[0]); close (fd[1]); if (syscall (__NR_pipe2, fd, O_CLOEXEC) != 0) { puts ("pipe2(O_CLOEXEC) failed"); return 1; } for (int i = 0; i < 2; ++i) { int coe = fcntl (fd[i], F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { printf ("pipe2(O_CLOEXEC) does not set close-on-exit for fd[%d]\n", i); return 1; } } close (fd[0]); close (fd[1]); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e5e6a244096c..0e80cd717d32 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1777,6 +1777,7 @@ static inline void allow_write_access(struct file *file) atomic_inc(&file->f_path.dentry->d_inode->i_writecount); } extern int do_pipe(int *); +extern int do_pipe_flags(int *, int); extern struct file *create_read_pipe(struct file *f); extern struct file *create_write_pipe(void); extern void free_write_pipe(struct file *); -- cgit v1.2.3 From be61a86d7237dd80510615f38ae21d6e1e98660c Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:40 -0700 Subject: flag parameters: NONBLOCK in pipe This patch adds O_NONBLOCK support to pipe2. It is minimally more involved than the patches for eventfd et.al but still trivial. The interfaces of the create_write_pipe and create_read_pipe helper functions were changed and the one other caller as well. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_pipe2 # ifdef __x86_64__ # define __NR_pipe2 293 # elif defined __i386__ # define __NR_pipe2 331 # else # error "need __NR_pipe2" # endif #endif int main (void) { int fds[2]; if (syscall (__NR_pipe2, fds, 0) == -1) { puts ("pipe2(0) failed"); return 1; } for (int i = 0; i < 2; ++i) { int fl = fcntl (fds[i], F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if (fl & O_NONBLOCK) { printf ("pipe2(0) set non-blocking mode for fds[%d]\n", i); return 1; } close (fds[i]); } if (syscall (__NR_pipe2, fds, O_NONBLOCK) == -1) { puts ("pipe2(O_NONBLOCK) failed"); return 1; } for (int i = 0; i < 2; ++i) { int fl = fcntl (fds[i], F_GETFL); if (fl == -1) { puts ("fcntl failed"); return 1; } if ((fl & O_NONBLOCK) == 0) { printf ("pipe2(O_NONBLOCK) does not set non-blocking mode for fds[%d]\n", i); return 1; } close (fds[i]); } puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0e80cd717d32..4b86f806014c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1778,8 +1778,8 @@ static inline void allow_write_access(struct file *file) } extern int do_pipe(int *); extern int do_pipe_flags(int *, int); -extern struct file *create_read_pipe(struct file *f); -extern struct file *create_write_pipe(void); +extern struct file *create_read_pipe(struct file *f, int flags); +extern struct file *create_write_pipe(int flags); extern void free_write_pipe(struct file *); extern struct file *do_filp_open(int dfd, const char *pathname, -- cgit v1.2.3 From bde74e4bc64415b142e556a34d295a52a1b7da9d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 25 Jul 2008 01:48:57 -0700 Subject: locks: add special return value for asynchronous locks Use a special error value FILE_LOCK_DEFERRED to mean that a locking operation returned asynchronously. This is returned by posix_lock_file() for sleeping locks to mean that the lock has been queued on the block list, and will be woken up when it might become available and needs to be retried (either fl_lmops->fl_notify() is called or fl_wait is woken up). f_op->lock() to mean either the above, or that the filesystem will call back with fl_lmops->fl_grant() when the result of the locking operation is known. The filesystem can do this for sleeping as well as non-sleeping locks. This is to make sure, that return values of -EAGAIN and -EINPROGRESS by filesystems are not mistaken to mean an asynchronous locking. This also makes error handling in fs/locks.c and lockd/svclock.c slightly cleaner. Signed-off-by: Miklos Szeredi Cc: Trond Myklebust Cc: "J. Bruce Fields" Cc: Matthew Wilcox Cc: David Teigland Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 4b86f806014c..49d8eb7a71be 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -885,6 +885,12 @@ static inline int file_check_writeable(struct file *filp) #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ +/* + * Special return value from posix_lock_file() and vfs_lock_file() for + * asynchronous locking. + */ +#define FILE_LOCK_DEFERRED 1 + /* * The POSIX file lock owner is determined by * the "struct files_struct" in the thread group -- cgit v1.2.3 From 19fd6231279be3c3bdd02ed99f9b0eb195978064 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:32 -0700 Subject: mm: spinlock tree_lock mapping->tree_lock has no read lockers. convert the lock from an rwlock to a spinlock. Signed-off-by: Nick Piggin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Hugh Dickins Cc: "Paul E. McKenney" Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 49d8eb7a71be..53d2edb709b3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -499,7 +499,7 @@ struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ - rwlock_t tree_lock; /* and rwlock protecting it */ + spinlock_t tree_lock; /* and lock protecting it */ unsigned int i_mmap_writable;/* count VM_SHARED mappings */ struct prio_tree_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ -- cgit v1.2.3 From d2d9648ec6858e19d16a0b16da62534e85888653 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 1 Jul 2008 14:16:09 +0200 Subject: [PATCH] reuse xxx_fifo_fops for xxx_pipe_fops Merge fifo and pipe file_operations. Signed-off-by: Denys Vlasenko Signed-off-by: Al Viro --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 53d2edb709b3..7721a2ac9c0e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1696,9 +1696,9 @@ extern void init_special_inode(struct inode *, umode_t, dev_t); extern void make_bad_inode(struct inode *); extern int is_bad_inode(struct inode *); -extern const struct file_operations read_fifo_fops; -extern const struct file_operations write_fifo_fops; -extern const struct file_operations rdwr_fifo_fops; +extern const struct file_operations read_pipefifo_fops; +extern const struct file_operations write_pipefifo_fops; +extern const struct file_operations rdwr_pipefifo_fops; extern int fs_may_remount_ro(struct super_block *); -- cgit v1.2.3 From e6305c43eda10ebfd2ad9e35d6e172ccc7bb3695 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 15 Jul 2008 21:03:57 -0400 Subject: [PATCH] sanitize ->permission() prototype * kill nameidata * argument; map the 3 bits in ->flags anybody cares about to new MAY_... ones and pass with the mask. * kill redundant gfs2_iop_permission() * sanitize ecryptfs_permission() * fix remaining places where ->permission() instances might barf on new MAY_... found in mask. The obvious next target in that direction is permission(9) folded fix for nfs_permission() breakage from Miklos Szeredi Signed-off-by: Al Viro --- include/linux/fs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7721a2ac9c0e..6c923c9b79bc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -60,6 +60,9 @@ extern int dir_notify_enable; #define MAY_WRITE 2 #define MAY_READ 4 #define MAY_APPEND 8 +#define MAY_ACCESS 16 +#define MAY_CHDIR 32 +#define MAY_OPEN 64 #define FMODE_READ 1 #define FMODE_WRITE 2 @@ -1272,7 +1275,7 @@ struct inode_operations { void * (*follow_link) (struct dentry *, struct nameidata *); void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); - int (*permission) (struct inode *, int, struct nameidata *); + int (*permission) (struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); -- cgit v1.2.3 From 2f1936b87783a3a56c9441b27b9ba7a747f11e8e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 24 Jun 2008 16:50:14 +0200 Subject: [patch 3/5] vfs: change remove_suid() to file_remove_suid() All calls to remove_suid() are made with a file pointer, because (similarly to file_update_time) it is called when the file is written. Clean up callers by passing in a file instead of a dentry. Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6c923c9b79bc..1a3546e69f9e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1834,7 +1834,7 @@ extern void clear_inode(struct inode *); extern void destroy_inode(struct inode *); extern struct inode *new_inode(struct super_block *); extern int should_remove_suid(struct dentry *); -extern int remove_suid(struct dentry *); +extern int file_remove_suid(struct file *); extern void __insert_inode_hash(struct inode *, unsigned long hashval); extern void remove_inode_hash(struct inode *); -- cgit v1.2.3 From db2e747b14991a4c6a5c98b0e5f552a193237c03 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 24 Jun 2008 16:50:16 +0200 Subject: [patch 5/5] vfs: remove mode parameter from vfs_symlink() Remove the unused mode parameter from vfs_symlink and callers. Thanks to Tetsuo Handa for noticing. CC: Tetsuo Handa Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a3546e69f9e..25998e803fc2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1139,7 +1139,7 @@ extern int vfs_permission(struct nameidata *, int); extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); extern int vfs_mkdir(struct inode *, struct dentry *, int); extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); -extern int vfs_symlink(struct inode *, struct dentry *, const char *, int); +extern int vfs_symlink(struct inode *, struct dentry *, const char *); extern int vfs_link(struct dentry *, struct inode *, struct dentry *); extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *); -- cgit v1.2.3 From a110343f0d6d41f68b7cf8c00b57a3172c67f816 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 17 Jul 2008 09:19:08 -0400 Subject: [PATCH] fix MAY_CHDIR/MAY_ACCESS/LOOKUP_ACCESS mess * MAY_CHDIR is redundant - it's an equivalent of MAY_ACCESS * MAY_ACCESS on fuse should affect only the last step of pathname resolution * fchdir() and chroot() should pass MAY_ACCESS, for the same reason why chdir() needs that. * now that we pass MAY_ACCESS explicitly in all cases, LOOKUP_ACCESS can be removed; it has no business being in nameidata. Signed-off-by: Al Viro --- include/linux/fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 25998e803fc2..d8721e818b45 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -61,8 +61,7 @@ extern int dir_notify_enable; #define MAY_READ 4 #define MAY_APPEND 8 #define MAY_ACCESS 16 -#define MAY_CHDIR 32 -#define MAY_OPEN 64 +#define MAY_OPEN 32 #define FMODE_READ 1 #define FMODE_WRITE 2 -- cgit v1.2.3 From 9767d74957450da6365c363d69e3d02d605d7375 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Jul 2008 15:01:26 +0200 Subject: [patch 1/4] vfs: utimes: move owner check into inode_change_ok() Add a new ia_valid flag: ATTR_TIMES_SET, to handle the UTIMES_OMIT/UTIMES_NOW and UTIMES_NOW/UTIMES_OMIT cases. In these cases neither ATTR_MTIME_SET nor ATTR_ATIME_SET is in the flags, yet the POSIX draft specifies that permission checking is performed the same way as if one or both of the times was explicitly set to a timestamp. See the path "vfs: utimensat(): fix error checking for {UTIME_NOW,UTIME_OMIT} case" by Michael Kerrisk for the patch introducing this behavior. This is a cleanup, as well as allowing filesystems (NFS/fuse/...) to perform their own permission checking instead of the default. CC: Ulrich Drepper CC: Michael Kerrisk Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- include/linux/fs.h | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index d8721e818b45..527b9e482f99 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -320,22 +320,23 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, * Attribute flags. These should be or-ed together to figure out what * has been changed! */ -#define ATTR_MODE 1 -#define ATTR_UID 2 -#define ATTR_GID 4 -#define ATTR_SIZE 8 -#define ATTR_ATIME 16 -#define ATTR_MTIME 32 -#define ATTR_CTIME 64 -#define ATTR_ATIME_SET 128 -#define ATTR_MTIME_SET 256 -#define ATTR_FORCE 512 /* Not a change, but a change it */ -#define ATTR_ATTR_FLAG 1024 -#define ATTR_KILL_SUID 2048 -#define ATTR_KILL_SGID 4096 -#define ATTR_FILE 8192 -#define ATTR_KILL_PRIV 16384 -#define ATTR_OPEN 32768 /* Truncating from open(O_TRUNC) */ +#define ATTR_MODE (1 << 0) +#define ATTR_UID (1 << 1) +#define ATTR_GID (1 << 2) +#define ATTR_SIZE (1 << 3) +#define ATTR_ATIME (1 << 4) +#define ATTR_MTIME (1 << 5) +#define ATTR_CTIME (1 << 6) +#define ATTR_ATIME_SET (1 << 7) +#define ATTR_MTIME_SET (1 << 8) +#define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ +#define ATTR_ATTR_FLAG (1 << 10) +#define ATTR_KILL_SUID (1 << 11) +#define ATTR_KILL_SGID (1 << 12) +#define ATTR_FILE (1 << 13) +#define ATTR_KILL_PRIV (1 << 14) +#define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ +#define ATTR_TIMES_SET (1 << 16) /* * This is the Inode Attributes structure, used for notify_change(). It -- cgit v1.2.3 From f419a2e3b64def707e1384ee38abb77f99af5f6d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Jul 2008 00:07:17 -0400 Subject: [PATCH] kill nameidata passing to permission(), rename to inode_permission() Incidentally, the name that gives hundreds of false positives on grep is not a good idea... Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 527b9e482f99..9d2de4cadabd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1770,7 +1770,7 @@ extern int do_remount_sb(struct super_block *sb, int flags, extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *); -extern int permission(struct inode *, int, struct nameidata *); +extern int inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int, int (*check_acl)(struct inode *, int)); -- cgit v1.2.3 From 516e0cc5646f377ab80fcc2ee639892eccb99853 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Jul 2008 00:39:17 -0400 Subject: [PATCH] f_count may wrap around make it atomic_long_t; while we are at it, get rid of useless checks in affs, hfs and hpfs - ->open() always has it equal to 1, ->release() - to 0. Signed-off-by: Al Viro --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9d2de4cadabd..7676fa1c20ae 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -795,7 +795,7 @@ struct file { #define f_dentry f_path.dentry #define f_vfsmnt f_path.mnt const struct file_operations *f_op; - atomic_t f_count; + atomic_long_t f_count; unsigned int f_flags; mode_t f_mode; loff_t f_pos; @@ -824,8 +824,8 @@ extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); #define file_list_unlock() spin_unlock(&files_lock); -#define get_file(x) atomic_inc(&(x)->f_count) -#define file_count(x) atomic_read(&(x)->f_count) +#define get_file(x) atomic_long_inc(&(x)->f_count) +#define file_count(x) atomic_long_read(&(x)->f_count) #ifdef CONFIG_DEBUG_WRITECOUNT static inline void file_take_write(struct file *f) -- cgit v1.2.3 From 3f8206d496e9e9495afb1d4e70d29712b4d403c9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 26 Jul 2008 03:46:43 -0400 Subject: [PATCH] get rid of indirect users of namei.h fs.h needs path.h, not namei.h; nfs_fs.h doesn't need it at all. Several places in the tree needed direct include. Signed-off-by: Al Viro --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7676fa1c20ae..8252b045e624 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -279,7 +279,7 @@ extern int dir_notify_enable; #include #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 8ab22b9abb5c55413802e4adc9aa6223324547c3 Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Mon, 28 Jul 2008 15:46:36 -0700 Subject: vfs: pagecache usage optimization for pagesize!=blocksize When we read some part of a file through pagecache, if there is a pagecache of corresponding index but this page is not uptodate, read IO is issued and this page will be uptodate. I think this is good for pagesize == blocksize environment but there is room for improvement on pagesize != blocksize environment. Because in this case a page can have multiple buffers and even if a page is not uptodate, some buffers can be uptodate. So I suggest that when all buffers which correspond to a part of a file that we want to read are uptodate, use this pagecache and copy data from this pagecache to user buffer even if a page is not uptodate. This can reduce read IO and improve system throughput. I wrote a benchmark program and got result number with this program. This benchmark do: 1: mount and open a test file. 2: create a 512MB file. 3: close a file and umount. 4: mount and again open a test file. 5: pwrite randomly 300000 times on a test file. offset is aligned by IO size(1024bytes). 6: measure time of preading randomly 100000 times on a test file. The result was: 2.6.26 330 sec 2.6.26-patched 226 sec Arch:i386 Filesystem:ext3 Blocksize:1024 bytes Memory: 1GB On ext3/4, a file is written through buffer/block. So random read/write mixed workloads or random read after random write workloads are optimized with this patch under pagesize != blocksize environment. This test result showed this. The benchmark program is as follows: #include #include #include #include #include #include #include #include #include #define LEN 1024 #define LOOP 1024*512 /* 512MB */ main(void) { unsigned long i, offset, filesize; int fd; char buf[LEN]; time_t t1, t2; if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } memset(buf, 0, LEN); fd = open("/root/test1/testfile", O_CREAT|O_RDWR|O_TRUNC); if (fd < 0) { perror("cannot open file\n"); exit(1); } for (i = 0; i < LOOP; i++) write(fd, buf, LEN); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } if (mount("/dev/sda1", "/root/test1/", "ext3", 0, 0) < 0) { perror("cannot mount\n"); exit(1); } fd = open("/root/test1/testfile", O_RDWR); if (fd < 0) { perror("cannot open file\n"); exit(1); } filesize = LEN * LOOP; for (i = 0; i < 300000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pwrite(fd, buf, LEN, offset); } printf("start test\n"); time(&t1); for (i = 0; i < 100000; i++){ offset = (random() % filesize) & (~(LEN - 1)); pread(fd, buf, LEN, offset); } time(&t2); printf("%ld sec\n", t2-t1); close(fd); if (umount("/root/test1/") < 0) { perror("cannot umount\n"); exit(1); } } Signed-off-by: Hisashi Hifumi Cc: Nick Piggin Cc: Christoph Hellwig Cc: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 8252b045e624..580b513668fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -443,6 +443,27 @@ static inline size_t iov_iter_count(struct iov_iter *i) return i->count; } +/* + * "descriptor" for what we're up to with a read. + * This allows us to use the same read code yet + * have multiple different users of the data that + * we read from a file. + * + * The simplest case just copies the data to user + * mode. + */ +typedef struct { + size_t written; + size_t count; + union { + char __user *buf; + void *data; + } arg; + int error; +} read_descriptor_t; + +typedef int (*read_actor_t)(read_descriptor_t *, struct page *, + unsigned long, unsigned long); struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); @@ -484,6 +505,8 @@ struct address_space_operations { int (*migratepage) (struct address_space *, struct page *, struct page *); int (*launder_page) (struct page *); + int (*is_partially_uptodate) (struct page *, read_descriptor_t *, + unsigned long); }; /* @@ -1198,27 +1221,6 @@ struct block_device_operations { struct module *owner; }; -/* - * "descriptor" for what we're up to with a read. - * This allows us to use the same read code yet - * have multiple different users of the data that - * we read from a file. - * - * The simplest case just copies the data to user - * mode. - */ -typedef struct { - size_t written; - size_t count; - union { - char __user * buf; - void *data; - } arg; - int error; -} read_descriptor_t; - -typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); - /* These macros are for out of kernel modules to test that * the kernel supports the unlocked_ioctl and compat_ioctl * fields in struct file_operations. */ -- cgit v1.2.3