diff options
author | Alex Gonzalez <alex.gonzalez@digi.com> | 2012-01-19 13:54:23 +0100 |
---|---|---|
committer | Alex Gonzalez <alex.gonzalez@digi.com> | 2012-01-19 13:54:23 +0100 |
commit | 802699c91a967767fc94759f7a3e5e82d8269245 (patch) | |
tree | c8b714dd25edd333efbbf8bb1eb6c3d379084cc4 /fs | |
parent | f135e68daa6745fd3dbb285e6161ae2758c4027f (diff) | |
parent | 675f7660ffb0e1880011f6b3c4f9ac241491e3cd (diff) |
Merge commit 'v2.6.35.14' into del-5.8/main
Conflicts:
arch/arm/plat-mxc/include/mach/gpio.h
arch/x86/kernel/cpu/mtrr/main.c
drivers/mmc/core/core.c
drivers/net/smsc911x.c
fs/proc/task_mmu.c
include/linux/pm_runtime.h
mm/memory.c
mm/mlock.c
Signed-off-by: Alex Gonzalez <alex.gonzalez@digi.com>
Diffstat (limited to 'fs')
113 files changed, 1262 insertions, 563 deletions
@@ -512,7 +512,7 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) ctx->reqs_active--; if (unlikely(!ctx->reqs_active && ctx->dead)) - wake_up(&ctx->wait); + wake_up_all(&ctx->wait); } static void aio_fput_routine(struct work_struct *data) @@ -712,8 +712,16 @@ static ssize_t aio_run_iocb(struct kiocb *iocb) */ ret = retry(iocb); - if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) + if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { + /* + * There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || + ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) + ret = -EINTR; aio_complete(iocb, ret, 0); + } out: spin_lock_irq(&ctx->ctx_lock); @@ -1225,7 +1233,7 @@ static void io_destroy(struct kioctx *ioctx) * by other CPUs at this point. Right now, we rely on the * locking done by the above calls to ensure this consistency. */ - wake_up(&ioctx->wait); + wake_up_all(&ioctx->wait); put_ioctx(ioctx); /* once for the lookup */ } @@ -1659,6 +1667,9 @@ long do_io_submit(aio_context_t ctx_id, long nr, if (unlikely(nr < 0)) return -EINVAL; + if (unlikely(nr > LONG_MAX/sizeof(*iocbpp))) + nr = LONG_MAX/sizeof(*iocbpp); + if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) return -EFAULT; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index c4e83537ead7..42b60b04ea06 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -723,7 +723,7 @@ static int __init init_misc_binfmt(void) { int err = register_filesystem(&bm_fs_type); if (!err) { - err = register_binfmt(&misc_format); + err = insert_binfmt(&misc_format); if (err) unregister_filesystem(&bm_fs_type); } @@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) { struct bio *bio; + if (nr_iovecs > UIO_MAXIOV) + return NULL; + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), gfp_mask); if (unlikely(!bio)) @@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd) static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, gfp_t gfp_mask) { - struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); + struct bio_map_data *bmd; + if (iov_count > UIO_MAXIOV) + return NULL; + + bmd = kmalloc(sizeof(*bmd), gfp_mask); if (!bmd) return NULL; @@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q, end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; len += iov[i].iov_len; } @@ -954,6 +967,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; /* * buffer must be aligned to at least hardsector size for now @@ -981,7 +1000,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long start = uaddr >> PAGE_SHIFT; const int local_nr_pages = end - start; const int page_limit = cur_page + local_nr_pages; - + ret = get_user_pages_fast(uaddr, local_nr_pages, write_to_vm, &pages[cur_page]); if (ret < local_nr_pages) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 99d6af811747..4c54c86e0289 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -681,8 +681,8 @@ retry: if (!bd_may_claim(bdev, whole, holder)) return -EBUSY; - /* if someone else is claiming, wait for it to finish */ - if (whole->bd_claiming && whole->bd_claiming != holder) { + /* if claiming is already in progress, wait for it to finish */ + if (whole->bd_claiming) { wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); DEFINE_WAIT(wait); @@ -1339,10 +1339,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) /* * hooks: /n/, see "layering violations". */ - ret = devcgroup_inode_permission(bdev->bd_inode, perm); - if (ret != 0) { - bdput(bdev); - return ret; + if (!for_part) { + ret = devcgroup_inode_permission(bdev->bd_inode, perm); + if (ret != 0) { + bdput(bdev); + return ret; + } } lock_kernel(); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 29c20092847e..e40aab413dfe 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1212,6 +1212,8 @@ struct btrfs_root { #define BTRFS_INODE_NOATIME (1 << 9) #define BTRFS_INODE_DIRSYNC (1 << 10) +#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) + /* some macros to generate set/get funcs for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: @@ -2239,6 +2241,8 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); int btrfs_find_orphan_roots(struct btrfs_root *tree_root); int btrfs_set_root_node(struct btrfs_root_item *item, struct extent_buffer *node); +void btrfs_check_and_init_root_item(struct btrfs_root_item *item); + /* dir-item.c */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 34f7c375567e..9cd32dc22d8a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1127,8 +1127,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); out: - if (location->objectid != BTRFS_TREE_LOG_OBJECTID) + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { root->ref_cows = 1; + btrfs_check_and_init_root_item(&root->root_item); + } return root; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9254b3d58dbe..06cb01497a5d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -282,6 +282,10 @@ static noinline int create_subvol(struct btrfs_root *root, inode_item->nbytes = cpu_to_le64(root->leafsize); inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + root_item.flags = 0; + root_item.byte_limit = 0; + inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT); + btrfs_set_root_bytenr(&root_item, leaf->start); btrfs_set_root_generation(&root_item, trans->transid); btrfs_set_root_level(&root_item, 0); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 2d958be761c8..6794baf1d93f 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -473,3 +473,21 @@ again: btrfs_free_path(path); return 0; } + +/* + * Old btrfs forgets to init root_item->flags and root_item->byte_limit + * for subvolumes. To work around this problem, we steal a bit from + * root_item->inode_item->flags, and use it to indicate if those fields + * have been properly initialized. + */ +void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item) +{ + u64 inode_flags = le64_to_cpu(root_item->inode.flags); + + if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) { + inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT; + root_item->inode.flags = cpu_to_le64(inode_flags); + root_item->flags = 0; + root_item->byte_limit = 0; + } +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 66e4c66cc63b..7dc5b0d76a96 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -895,6 +895,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + btrfs_check_and_init_root_item(new_root_item); old = btrfs_lock_root_node(root); btrfs_cow_block(trans, root, old, NULL, 0, &old); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d6e3af8be95b..dc0dedfa8e0c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -22,6 +22,7 @@ #include <linux/blkdev.h> #include <linux/random.h> #include <linux/iocontext.h> +#include <linux/capability.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -1909,6 +1910,9 @@ int btrfs_balance(struct btrfs_root *dev_root) if (dev_root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + mutex_lock(&dev_root->fs_info->volume_mutex); dev_root = dev_root->fs_info->dev_root; diff --git a/fs/char_dev.c b/fs/char_dev.c index d6db933df2b2..143d393881cb 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -20,6 +20,7 @@ #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/backing-dev.h> +#include <linux/tty.h> #include "internal.h" @@ -39,7 +40,9 @@ struct backing_dev_info directly_mappable_cdev_bdi = { #endif /* permit direct mmap, for read, write or exec */ BDI_CAP_MAP_DIRECT | - BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP), + BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP | + /* no writeback happens */ + BDI_CAP_NO_ACCT_AND_WRITEBACK), }; static struct kobj_map *cdev_map; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index fb6318b81509..21088dc9f66f 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -104,7 +104,8 @@ extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, extern struct cifsFileInfo *cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, struct file *file, - struct vfsmount *mnt, unsigned int oflags); + struct vfsmount *mnt, unsigned int oflags, + __u32 oplock); extern int cifs_posix_open(char *full_path, struct inode **pinode, struct super_block *sb, int mode, int oflags, @@ -342,7 +343,7 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, const __u16 netfid, const __u64 len, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, - const bool waitFlag); + const bool waitFlag, const __u8 oplock_level); extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, const __u16 smb_file_id, const int get_flag, const __u64 len, struct file_lock *, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index c65c3419dd37..6b764d7505de 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1647,7 +1647,8 @@ int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, const __u16 smb_file_id, const __u64 len, const __u64 offset, const __u32 numUnlock, - const __u32 numLock, const __u8 lockType, const bool waitFlag) + const __u32 numLock, const __u8 lockType, + const bool waitFlag, const __u8 oplock_level) { int rc = 0; LOCK_REQ *pSMB = NULL; @@ -1675,6 +1676,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, pSMB->NumberOfLocks = cpu_to_le16(numLock); pSMB->NumberOfUnlocks = cpu_to_le16(numUnlock); pSMB->LockType = lockType; + pSMB->OplockLevel = oplock_level; pSMB->AndXCommand = 0xFF; /* none */ pSMB->Fid = smb_file_id; /* netfid stays le */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2208f06e4c45..79168612049a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -801,8 +801,7 @@ static int cifs_parse_mount_options(char *options, const char *devname, struct smb_vol *vol) { - char *value; - char *data; + char *value, *data, *end; unsigned int temp_len, i, j; char separator[2]; short int override_uid = -1; @@ -845,6 +844,7 @@ cifs_parse_mount_options(char *options, const char *devname, if (!options) return 1; + end = options + strlen(options); if (strncmp(options, "sep=", 4) == 0) { if (options[4] != 0) { separator[0] = options[4]; @@ -909,6 +909,7 @@ cifs_parse_mount_options(char *options, const char *devname, the only illegal character in a password is null */ if ((value[temp_len] == 0) && + (value + temp_len < end) && (value[temp_len+1] == separator[0])) { /* reinsert comma */ value[temp_len] = separator[0]; @@ -1647,9 +1648,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) if (ses) { cFYI(1, "Existing smb sess found (status=%d)", ses->status); - /* existing SMB ses has a server reference already */ - cifs_put_tcp_session(server); - mutex_lock(&ses->session_mutex); rc = cifs_negotiate_protocol(xid, ses); if (rc) { @@ -1672,6 +1670,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) } } mutex_unlock(&ses->session_mutex); + + /* existing SMB ses has a server reference already */ + cifs_put_tcp_session(server); FreeXid(xid); return ses; } @@ -2422,6 +2423,11 @@ is_path_accessible(int xid, struct cifsTconInfo *tcon, 0 /* not legacy */, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc == -EOPNOTSUPP || rc == -EINVAL) + rc = SMBQueryInformation(xid, tcon, full_path, pfile_info, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); kfree(pfile_info); return rc; } @@ -2606,7 +2612,7 @@ try_mount_again: remote_path_check: /* check if a whole path (including prepath) is not remote */ - if (!rc && cifs_sb->prepathlen && tcon) { + if (!rc && tcon) { /* build_path_to_root works only when we have a valid tcon */ full_path = cifs_build_path_to_root(cifs_sb); if (full_path == NULL) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index e7ae78b66fa1..9c338780fbab 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -138,9 +138,9 @@ cifs_bp_rename_retry: */ struct cifsFileInfo * cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, - struct file *file, struct vfsmount *mnt, unsigned int oflags) + struct file *file, struct vfsmount *mnt, unsigned int oflags, + __u32 oplock) { - int oplock = 0; struct cifsFileInfo *pCifsFile; struct cifsInodeInfo *pCifsInode; struct cifs_sb_info *cifs_sb = CIFS_SB(mnt->mnt_sb); @@ -149,9 +149,6 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, if (pCifsFile == NULL) return pCifsFile; - if (oplockEnabled) - oplock = REQ_OPLOCK; - pCifsFile->netfid = fileHandle; pCifsFile->pid = current->tgid; pCifsFile->pInode = igrab(newinode); @@ -476,7 +473,7 @@ cifs_create_set_dentry: } pfile_info = cifs_new_fileinfo(newinode, fileHandle, filp, - nd->path.mnt, oflags); + nd->path.mnt, oflags, oplock); if (pfile_info == NULL) { fput(filp); CIFSSMBClose(xid, tcon, fileHandle); @@ -738,7 +735,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, cfile = cifs_new_fileinfo(newInode, fileHandle, filp, nd->path.mnt, - nd->intent.open.flags); + nd->intent.open.flags, + oplock); if (cfile == NULL) { fput(filp); CIFSSMBClose(xid, pTcon, fileHandle); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 409e4f523e61..8a9e6888c041 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -277,7 +277,7 @@ int cifs_open(struct inode *inode, struct file *file) pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt, - oflags); + oflags, oplock); if (pCifsFile == NULL) { CIFSSMBClose(xid, tcon, netfid); rc = -ENOMEM; @@ -367,7 +367,7 @@ int cifs_open(struct inode *inode, struct file *file) goto out; pCifsFile = cifs_new_fileinfo(inode, netfid, file, file->f_path.mnt, - file->f_flags); + file->f_flags, oplock); if (pCifsFile == NULL) { rc = -ENOMEM; goto out; @@ -796,12 +796,12 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) /* BB we could chain these into one lock request BB */ rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, - 0, 1, lockType, 0 /* wait flag */ ); + 0, 1, lockType, 0 /* wait flag */, 0); if (rc == 0) { rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 1 /* numUnlock */ , 0 /* numLock */ , lockType, - 0 /* wait flag */ ); + 0 /* wait flag */, 0); pfLock->fl_type = F_UNLCK; if (rc != 0) cERROR(1, "Error unlocking previously locked " @@ -818,13 +818,13 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 0, 1, lockType | LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */); + 0 /* wait flag */, 0); if (rc == 0) { rc = CIFSSMBLock(xid, tcon, netfid, length, pfLock->fl_start, 1, 0, lockType | LOCKING_ANDX_SHARED_LOCK, - 0 /* wait flag */); + 0 /* wait flag */, 0); pfLock->fl_type = F_RDLCK; if (rc != 0) cERROR(1, "Error unlocking " @@ -868,8 +868,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) if (numLock) { rc = CIFSSMBLock(xid, tcon, netfid, length, - pfLock->fl_start, - 0, numLock, lockType, wait_flag); + pfLock->fl_start, 0, numLock, lockType, + wait_flag, 0); if (rc == 0) { /* For Windows locks we must store them. */ @@ -889,9 +889,9 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) (pfLock->fl_start + length) >= (li->offset + li->length)) { stored_rc = CIFSSMBLock(xid, tcon, - netfid, - li->length, li->offset, - 1, 0, li->type, false); + netfid, li->length, + li->offset, 1, 0, + li->type, false, 0); if (stored_rc) rc = stored_rc; else { @@ -2300,7 +2300,8 @@ cifs_oplock_break(struct slow_work *work) */ if (!cfile->closePend && !cfile->oplock_break_cancelled) { rc = CIFSSMBLock(0, cifs_sb->tcon, cfile->netfid, 0, 0, 0, 0, - LOCKING_ANDX_OPLOCK_RELEASE, false); + LOCKING_ANDX_OPLOCK_RELEASE, false, + cinode->clientCanCacheRead ? 1 : 0); cFYI(1, "Oplock release rc = %d", rc); } } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 6f0683c68952..b5dbf82762de 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -804,8 +804,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) rc = cifs_get_inode_info(&inode, full_path, NULL, sb, xid, NULL); - if (!inode) - return ERR_PTR(-ENOMEM); + if (!inode) { + inode = ERR_PTR(rc); + goto out; + } if (rc && cifs_sb->tcon->ipc) { cFYI(1, "ipc connection - fake read inode"); @@ -816,13 +818,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) inode->i_uid = cifs_sb->mnt_uid; inode->i_gid = cifs_sb->mnt_gid; } else if (rc) { - kfree(full_path); - _FreeXid(xid); iget_failed(inode); - return ERR_PTR(rc); + inode = ERR_PTR(rc); } - +out: kfree(full_path); /* can not call macro FreeXid here since in a void func * TODO: This is no longer true diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 0a57cb7db5dd..42873c14dd91 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -913,7 +913,9 @@ ssetup_ntlmssp_authenticate: } /* BB check if Unicode and decode strings */ - if (smb_buf->Flags2 & SMBFLG2_UNICODE) { + if (bytes_remaining == 0) { + /* no string area to decode, do nothing */ + } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { /* unicode string area must be word-aligned */ if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { ++bcc_ptr; diff --git a/fs/compat.c b/fs/compat.c index 6490d2134ff3..34bf9fcac319 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1150,7 +1150,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, { compat_ssize_t tot_len; struct iovec iovstack[UIO_FASTIOV]; - struct iovec *iov; + struct iovec *iov = iovstack; ssize_t ret; io_fn_t fn; iov_fn_t fnv; @@ -1376,6 +1376,10 @@ static int compat_count(compat_uptr_t __user *argv, int max) argv++; if (i++ >= max) return -E2BIG; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); } } return i; @@ -1417,6 +1421,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, while (len > 0) { int offset, bytes_to_copy; + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } + cond_resched(); + offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; @@ -1433,18 +1443,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; -#ifdef CONFIG_STACK_GROWSUP - ret = expand_stack_downwards(bprm->vma, pos); - if (ret < 0) { - /* We've exceed the stack rlimit. */ - ret = -E2BIG; - goto out; - } -#endif - ret = get_user_pages(current, bprm->mm, pos, - 1, 1, 1, &page, NULL); - if (ret <= 0) { - /* We've exceed the stack rlimit. */ + page = get_arg_page(bprm, pos, 1); + if (!page) { ret = -E2BIG; goto out; } @@ -1565,8 +1565,10 @@ int compat_do_execve(char * filename, return retval; out: - if (bprm->mm) + if (bprm->mm) { + acct_arg_size(bprm, 0); mmput(bprm->mm); + } out_file: if (bprm->file) { diff --git a/fs/dcache.c b/fs/dcache.c index 86d4db15473e..c1f86c734540 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1186,9 +1186,12 @@ struct dentry *d_obtain_alias(struct inode *inode) spin_unlock(&tmp->d_lock); spin_unlock(&dcache_lock); + security_d_instantiate(tmp, inode); return tmp; out_iput: + if (res && !IS_ERR(res)) + security_d_instantiate(res, inode); iput(inode); return res; } diff --git a/fs/dcookies.c b/fs/dcookies.c index a21cabdbd87b..dda0dc702d1b 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -178,6 +178,8 @@ SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) /* FIXME: (deleted) ? */ path = d_path(&dcs->path, kbuf, PAGE_SIZE); + mutex_unlock(&dcookie_mutex); + if (IS_ERR(path)) { err = PTR_ERR(path); goto out_free; @@ -194,6 +196,7 @@ SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) out_free: kfree(kbuf); + return err; out: mutex_unlock(&dcookie_mutex); return err; diff --git a/fs/direct-io.c b/fs/direct-io.c index 7600aacf531d..458fdd360dda 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio) * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static int dio_complete(struct dio *dio, loff_t offset, int ret) +static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) { ssize_t transferred = 0; @@ -239,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) transferred = dio->i_size - offset; } - if (dio->end_io && dio->result) - dio->end_io(dio->iocb, offset, transferred, - dio->map_bh.b_private); - - if (dio->flags & DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); - if (ret == 0) ret = dio->page_errors; if (ret == 0) @@ -254,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) if (ret == 0) ret = transferred; + if (dio->end_io && dio->result) { + dio->end_io(dio->iocb, offset, transferred, + dio->map_bh.b_private, ret, is_async); + } else if (is_async) { + aio_complete(dio->iocb, ret, 0); + } + + if (dio->flags & DIO_LOCKING) + /* lockdep: non-owner release */ + up_read_non_owner(&dio->inode->i_alloc_sem); + return ret; } @@ -277,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - int ret = dio_complete(dio, dio->iocb->ki_pos, 0); - aio_complete(dio->iocb, ret, 0); + dio_complete(dio, dio->iocb->ki_pos, 0, true); kfree(dio); } } @@ -632,7 +634,7 @@ static int dio_send_cur_page(struct dio *dio) int ret = 0; if (dio->bio) { - loff_t cur_offset = dio->block_in_file << dio->blkbits; + loff_t cur_offset = dio->cur_page_fs_offset; loff_t bio_next_offset = dio->logical_offset_in_bio + dio->bio->bi_size; @@ -657,7 +659,7 @@ static int dio_send_cur_page(struct dio *dio) * Submit now if the underlying fs is about to perform a * metadata read */ - if (dio->boundary) + else if (dio->boundary) dio_bio_submit(dio); } @@ -1126,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_unlock_irqrestore(&dio->bio_lock, flags); if (ret2 == 0) { - ret = dio_complete(dio, offset, ret); + ret = dio_complete(dio, offset, ret, false); kfree(dio); } else BUG_ON(ret != -EIOCBQUEUED); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index e8fcf4e2ed7d..622c95140802 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -199,7 +199,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) "the persistent file for the dentry with name " "[%s]; rc = [%d]\n", __func__, ecryptfs_dentry->d_name.name, rc); - goto out; + goto out_free; } } if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) @@ -207,7 +207,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) rc = -EPERM; printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " "file must hence be opened RO\n", __func__); - goto out; + goto out_free; } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); @@ -292,12 +292,40 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag) return rc; } -static int ecryptfs_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg); +static long +ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct file *lower_file = NULL; + long rc = -ENOTTY; + + if (ecryptfs_file_to_private(file)) + lower_file = ecryptfs_file_to_lower(file); + if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl) + rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); + return rc; +} + +#ifdef CONFIG_COMPAT +static long +ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct file *lower_file = NULL; + long rc = -ENOIOCTLCMD; + + if (ecryptfs_file_to_private(file)) + lower_file = ecryptfs_file_to_lower(file); + if (lower_file && lower_file->f_op && lower_file->f_op->compat_ioctl) + rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); + return rc; +} +#endif const struct file_operations ecryptfs_dir_fops = { .readdir = ecryptfs_readdir, - .ioctl = ecryptfs_ioctl, + .unlocked_ioctl = ecryptfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ecryptfs_compat_ioctl, +#endif .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, @@ -313,7 +341,10 @@ const struct file_operations ecryptfs_main_fops = { .write = do_sync_write, .aio_write = generic_file_aio_write, .readdir = ecryptfs_readdir, - .ioctl = ecryptfs_ioctl, + .unlocked_ioctl = ecryptfs_unlocked_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ecryptfs_compat_ioctl, +#endif .mmap = generic_file_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, @@ -322,20 +353,3 @@ const struct file_operations ecryptfs_main_fops = { .fasync = ecryptfs_fasync, .splice_read = generic_file_splice_read, }; - -static int -ecryptfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, - unsigned long arg) -{ - int rc = 0; - struct file *lower_file = NULL; - - if (ecryptfs_file_to_private(file)) - lower_file = ecryptfs_file_to_lower(file); - if (lower_file && lower_file->f_op && lower_file->f_op->ioctl) - rc = lower_file->f_op->ioctl(ecryptfs_inode_to_lower(inode), - lower_file, cmd, arg); - else - rc = -ENOTTY; - return rc; -} diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 31ef5252f0fe..1681c620884b 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -32,6 +32,7 @@ #include <linux/crypto.h> #include <linux/fs_stack.h> #include <linux/slab.h> +#include <linux/xattr.h> #include <asm/unaligned.h> #include "ecryptfs_kernel.h" @@ -70,15 +71,19 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode, struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); struct dentry *dentry_save; struct vfsmount *vfsmount_save; + unsigned int flags_save; int rc; dentry_save = nd->path.dentry; vfsmount_save = nd->path.mnt; + flags_save = nd->flags; nd->path.dentry = lower_dentry; nd->path.mnt = lower_mnt; + nd->flags &= ~LOOKUP_OPEN; rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); nd->path.dentry = dentry_save; nd->path.mnt = vfsmount_save; + nd->flags = flags_save; return rc; } @@ -264,7 +269,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, printk(KERN_ERR "%s: Out of memory whilst attempting " "to allocate ecryptfs_dentry_info struct\n", __func__); - goto out_dput; + goto out_put; } ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); @@ -339,8 +344,9 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, out_free_kmem: kmem_cache_free(ecryptfs_header_cache_2, page_virt); goto out; -out_dput: +out_put: dput(lower_dentry); + mntput(lower_mnt); d_drop(ecryptfs_dentry); out: return rc; @@ -997,6 +1003,8 @@ int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), ecryptfs_dentry_to_lower(dentry), &lower_stat); if (!rc) { + fsstack_copy_attr_all(dentry->d_inode, + ecryptfs_inode_to_lower(dentry->d_inode)); generic_fillattr(dentry->d_inode, stat); stat->blocks = lower_stat.blocks; } @@ -1015,10 +1023,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, rc = -EOPNOTSUPP; goto out; } - mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value, - size, flags); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + + rc = vfs_setxattr(lower_dentry, name, value, size, flags); out: return rc; } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 89c5476506ef..07f23c508192 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -482,8 +482,8 @@ struct ecryptfs_write_tag_70_packet_silly_stack { struct mutex *tfm_mutex; char *block_aligned_filename; struct ecryptfs_auth_tok *auth_tok; - struct scatterlist src_sg; - struct scatterlist dst_sg; + struct scatterlist src_sg[2]; + struct scatterlist dst_sg[2]; struct blkcipher_desc desc; char iv[ECRYPTFS_MAX_IV_BYTES]; char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; @@ -696,23 +696,21 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename, filename_size); rc = virt_to_scatterlist(s->block_aligned_filename, - s->block_aligned_filename_size, &s->src_sg, 1); - if (rc != 1) { + s->block_aligned_filename_size, s->src_sg, 2); + if (rc < 1) { printk(KERN_ERR "%s: Internal error whilst attempting to " - "convert filename memory to scatterlist; " - "expected rc = 1; got rc = [%d]. " + "convert filename memory to scatterlist; rc = [%d]. " "block_aligned_filename_size = [%zd]\n", __func__, rc, s->block_aligned_filename_size); goto out_release_free_unlock; } rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size, - &s->dst_sg, 1); - if (rc != 1) { + s->dst_sg, 2); + if (rc < 1) { printk(KERN_ERR "%s: Internal error whilst attempting to " "convert encrypted filename memory to scatterlist; " - "expected rc = 1; got rc = [%d]. " - "block_aligned_filename_size = [%zd]\n", __func__, rc, - s->block_aligned_filename_size); + "rc = [%d]. block_aligned_filename_size = [%zd]\n", + __func__, rc, s->block_aligned_filename_size); goto out_release_free_unlock; } /* The characters in the first block effectively do the job @@ -735,7 +733,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, mount_crypt_stat->global_default_fn_cipher_key_bytes); goto out_release_free_unlock; } - rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg, + rc = crypto_blkcipher_encrypt_iv(&s->desc, s->dst_sg, s->src_sg, s->block_aligned_filename_size); if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt filename; " @@ -767,8 +765,8 @@ struct ecryptfs_parse_tag_70_packet_silly_stack { struct mutex *tfm_mutex; char *decrypted_filename; struct ecryptfs_auth_tok *auth_tok; - struct scatterlist src_sg; - struct scatterlist dst_sg; + struct scatterlist src_sg[2]; + struct scatterlist dst_sg[2]; struct blkcipher_desc desc; char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; char iv[ECRYPTFS_MAX_IV_BYTES]; @@ -873,13 +871,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, } mutex_lock(s->tfm_mutex); rc = virt_to_scatterlist(&data[(*packet_size)], - s->block_aligned_filename_size, &s->src_sg, 1); - if (rc != 1) { + s->block_aligned_filename_size, s->src_sg, 2); + if (rc < 1) { printk(KERN_ERR "%s: Internal error whilst attempting to " "convert encrypted filename memory to scatterlist; " - "expected rc = 1; got rc = [%d]. " - "block_aligned_filename_size = [%zd]\n", __func__, rc, - s->block_aligned_filename_size); + "rc = [%d]. block_aligned_filename_size = [%zd]\n", + __func__, rc, s->block_aligned_filename_size); goto out_unlock; } (*packet_size) += s->block_aligned_filename_size; @@ -893,13 +890,12 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, goto out_unlock; } rc = virt_to_scatterlist(s->decrypted_filename, - s->block_aligned_filename_size, &s->dst_sg, 1); - if (rc != 1) { + s->block_aligned_filename_size, s->dst_sg, 2); + if (rc < 1) { printk(KERN_ERR "%s: Internal error whilst attempting to " "convert decrypted filename memory to scatterlist; " - "expected rc = 1; got rc = [%d]. " - "block_aligned_filename_size = [%zd]\n", __func__, rc, - s->block_aligned_filename_size); + "rc = [%d]. block_aligned_filename_size = [%zd]\n", + __func__, rc, s->block_aligned_filename_size); goto out_free_unlock; } /* The characters in the first block effectively do the job of @@ -938,7 +934,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, mount_crypt_stat->global_default_fn_cipher_key_bytes); goto out_free_unlock; } - rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg, + rc = crypto_blkcipher_decrypt_iv(&s->desc, s->dst_sg, s->src_sg, s->block_aligned_filename_size); if (rc) { printk(KERN_ERR "%s: Error attempting to decrypt filename; " @@ -1543,6 +1539,7 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, printk(KERN_ERR "Could not find key with description: [%s]\n", sig); rc = process_request_key_err(PTR_ERR(*auth_tok_key)); + (*auth_tok_key) = NULL; goto out; } (*auth_tok) = ecryptfs_get_key_payload_data(*auth_tok_key); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index b1d82756544b..b530d9cd5bf0 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -368,6 +368,11 @@ static int ecryptfs_write_begin(struct file *file, && (pos != 0)) zero_user(page, 0, PAGE_CACHE_SIZE); out: + if (unlikely(rc)) { + unlock_page(page); + page_cache_release(page); + *pagep = NULL; + } return rc; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3817149919cb..25bcbd19f5eb 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -63,6 +63,13 @@ * cleanup path and it is also acquired by eventpoll_release_file() * if a file has been pushed inside an epoll set and it is then * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL). + * It is also acquired when inserting an epoll fd onto another epoll + * fd. We do this so that we walk the epoll tree and ensure that this + * insertion does not create a cycle of epoll file descriptors, which + * could lead to deadlock. We need a global mutex to prevent two + * simultaneous inserts (A into B and B into A) from racing and + * constructing a cycle without either insert observing that it is + * going to. * It is possible to drop the "ep->mtx" and to use the global * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. @@ -227,6 +234,9 @@ static int max_user_watches __read_mostly; */ static DEFINE_MUTEX(epmutex); +/* Used to check for epoll file descriptor inclusion loops */ +static struct nested_calls poll_loop_ncalls; + /* Used for safe wake up implementation */ static struct nested_calls poll_safewake_ncalls; @@ -1181,6 +1191,62 @@ retry: return res; } +/** + * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested() + * API, to verify that adding an epoll file inside another + * epoll structure, does not violate the constraints, in + * terms of closed loops, or too deep chains (which can + * result in excessive stack usage). + * + * @priv: Pointer to the epoll file to be currently checked. + * @cookie: Original cookie for this call. This is the top-of-the-chain epoll + * data structure pointer. + * @call_nests: Current dept of the @ep_call_nested() call stack. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct eventpoll *ep = file->private_data; + struct rb_node *rbp; + struct epitem *epi; + + mutex_lock(&ep->mtx); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (unlikely(is_file_epoll(epi->ffd.file))) { + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, epi->ffd.file, + epi->ffd.file->private_data, current); + if (error != 0) + break; + } + } + mutex_unlock(&ep->mtx); + + return error; +} + +/** + * ep_loop_check - Performs a check to verify that adding an epoll file (@file) + * another epoll file (represented by @ep) does not create + * closed loops or too deep chains. + * + * @ep: Pointer to the epoll private data structure. + * @file: Pointer to the epoll file to be checked. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check(struct eventpoll *ep, struct file *file) +{ + return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, file, ep, current); +} + /* * Open an eventpoll file descriptor. */ @@ -1229,6 +1295,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { int error; + int did_lock_epmutex = 0; struct file *file, *tfile; struct eventpoll *ep; struct epitem *epi; @@ -1270,6 +1337,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, */ ep = file->private_data; + /* + * When we insert an epoll file descriptor, inside another epoll file + * descriptor, there is the change of creating closed loops, which are + * better be handled here, than in more critical paths. + * + * We hold epmutex across the loop check and the insert in this case, in + * order to prevent two separate inserts from racing and each doing the + * insert "at the same time" such that ep_loop_check passes on both + * before either one does the insert, thereby creating a cycle. + */ + if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { + mutex_lock(&epmutex); + did_lock_epmutex = 1; + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } + + mutex_lock(&ep->mtx); /* @@ -1305,6 +1391,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, mutex_unlock(&ep->mtx); error_tgt_fput: + if (unlikely(did_lock_epmutex)) + mutex_unlock(&epmutex); + fput(tfile); error_fput: fput(file); @@ -1423,6 +1512,12 @@ static int __init eventpoll_init(void) max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; + /* + * Initialize the structure used to perform epoll file descriptor + * inclusion loops checks. + */ + ep_nested_calls_init(&poll_loop_ncalls); + /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); diff --git a/fs/exec.c b/fs/exec.c index e19de6a80339..34c671e4e4e3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -158,7 +158,22 @@ out: #ifdef CONFIG_MMU -static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ + struct mm_struct *mm = current->mm; + long diff = (long)(pages - bprm->vma_pages); + + if (!mm || !diff) + return; + + bprm->vma_pages = pages; + + down_write(&mm->mmap_sem); + mm->total_vm += diff; + up_write(&mm->mmap_sem); +} + +struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -180,6 +195,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; struct rlimit *rlim; + acct_arg_size(bprm, size / PAGE_SIZE); + /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks @@ -248,6 +265,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); + + err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); + if (err) + goto err; + err = insert_vm_struct(mm, vma); if (err) goto err; @@ -270,7 +292,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len) #else -static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ +} + +struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -377,6 +403,9 @@ static int count(char __user * __user * argv, int max) argv++; if (i++ >= max) return -E2BIG; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; cond_resched(); } } @@ -420,6 +449,12 @@ static int copy_strings(int argc, char __user * __user * argv, while (len > 0) { int offset, bytes_to_copy; + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } + cond_resched(); + offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; @@ -594,6 +629,11 @@ int setup_arg_pages(struct linux_binprm *bprm, #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); + + if (unlikely(stack_top < mmap_min_addr) || + unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) + return -ENOMEM; + stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; @@ -977,12 +1017,14 @@ int flush_old_exec(struct linux_binprm * bprm) /* * Release all of the old mmap stuff */ + acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; bprm->mm = NULL; /* We're using it now */ + set_fs(USER_DS); current->flags &= ~PF_RANDOMIZE; flush_thread(); current->personality &= ~bprm->per_clear; @@ -1247,10 +1289,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) if (retval) return retval; - /* kernel module loader fixup */ - /* so we don't try to load run modprobe in kernel space. */ - set_fs(USER_DS); - retval = audit_bprm(bprm); if (retval) return retval; @@ -1401,8 +1439,10 @@ int do_execve(char * filename, return retval; out: - if (bprm->mm) - mmput (bprm->mm); + if (bprm->mm) { + acct_arg_size(bprm, 0); + mmput(bprm->mm); + } out_file: if (bprm->file) { diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 71efb0e9a3f2..b5d9028af770 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -344,7 +344,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; - inode_inc_link_count(old_inode); ext2_set_link(new_dir, new_de, new_page, old_inode, 1); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) @@ -356,12 +355,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, if (new_dir->i_nlink >= EXT2_LINK_MAX) goto out_dir; } - inode_inc_link_count(old_inode); err = ext2_add_link(new_dentry, old_inode); - if (err) { - inode_dec_link_count(old_inode); + if (err) goto out_dir; - } if (dir_de) inode_inc_link_count(new_dir); } @@ -369,12 +365,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, /* * Like most other Unix systems, set the ctime for inodes on a * rename. - * inode_dec_link_count() will mark the inode dirty. */ old_inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(old_inode); ext2_delete_entry (old_de, old_page); - inode_dec_link_count(old_inode); if (dir_de) { if (old_dir != new_dir) diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index ee184084ca42..7eecac016e3f 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1425,10 +1425,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->at = entries; frame->bh = bh; bh = bh2; + /* + * Mark buffers dirty here so that if do_split() fails we write a + * consistent set of buffers to disk. + */ + ext3_journal_dirty_metadata(handle, frame->bh); + ext3_journal_dirty_metadata(handle, bh); de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - dx_release (frames); - if (!(de)) + if (!de) { + ext3_mark_inode_dirty(handle, dir); + dx_release(frames); return retval; + } + dx_release(frames); return add_dirent_to_buf(handle, dentry, inode, de, bh); } @@ -1550,8 +1559,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; node2 = (struct dx_node *)(bh2->b_data); entries2 = node2->entries; + memset(&node2->fake, 0, sizeof(struct fake_dirent)); node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); - node2->fake.inode = 0; BUFFER_TRACE(frame->bh, "get_write_access"); err = ext3_journal_get_write_access(handle, frame->bh); if (err) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6c953bb255e7..6c7efb689062 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1459,6 +1459,13 @@ static void ext3_orphan_cleanup (struct super_block * sb, return; } + /* Check if feature set allows readwrite operations */ + if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) { + printk(KERN_INFO "EXT3-fs: %s: Skipping orphan cleanup due to " + "unknown ROCOMPAT features\n", sb->s_id); + return; + } + if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { if (es->s_last_orphan) jbd_debug(1, "Errors on filesystem, " diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 19a4de57128a..4c9f05dc7041 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -167,13 +167,15 @@ struct mpage_da_data { }; #define EXT4_IO_UNWRITTEN 0x1 typedef struct ext4_io_end { - struct list_head list; /* per-file finished AIO list */ + struct list_head list; /* per-file finished IO list */ struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ struct page *page; /* page struct for buffer write */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ struct work_struct work; /* data work queue */ + struct kiocb *iocb; /* iocb struct for AIO */ + int result; /* error value for AIO */ } ext4_io_end_t; /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 42272d67955a..0595b8cf0178 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3668,6 +3668,8 @@ static int ext4_end_io_nolock(ext4_io_end_t *io) return ret; } + if (io->iocb) + aio_complete(io->iocb, io->result, 0); /* clear the DIO AIO unwritten flag */ io->flag = 0; return ret; @@ -3767,6 +3769,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) io->offset = 0; io->size = 0; io->page = NULL; + io->iocb = NULL; + io->result = 0; INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -3775,7 +3779,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) + ssize_t size, void *private, int ret, + bool is_async) { ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; @@ -3784,7 +3789,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) - return; + goto out; ext_debug("ext4_end_io_dio(): io_end 0x%p" "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", @@ -3795,12 +3800,18 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (io_end->flag != EXT4_IO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; +out: + if (is_async) + aio_complete(iocb, ret, 0); return; } io_end->offset = offset; io_end->size = size; - io_end->flag = EXT4_IO_UNWRITTEN; + if (is_async) { + io_end->iocb = iocb; + io_end->result = ret; + } wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; /* queue the work to convert unwritten extents to written */ @@ -5582,13 +5593,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, /* if nrblocks are contiguous */ if (chunk) { /* - * With N contiguous data blocks, it need at most - * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks - * 2 dindirect blocks - * 1 tindirect block + * With N contiguous data blocks, we need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, + * 2 dindirect blocks, and 1 tindirect block */ - indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); - return indirects + 3; + return DIV_ROUND_UP(nrblocks, + EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; } /* * if nrblocks are not contiguous, worse case, each block touch diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 12b3bc026a68..994a719b995c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1175,6 +1175,8 @@ repeat_load_buddy: return 0; err: + if (page) + page_cache_release(page); if (e4b->bd_bitmap_page) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4e8983a9811b..a45ced96b042 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -241,7 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); - vfs_check_frozen(sb, SB_FREEZE_WRITE); + vfs_check_frozen(sb, SB_FREEZE_TRANS); /* Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ @@ -3491,7 +3491,7 @@ int ext4_force_commit(struct super_block *sb) journal = EXT4_SB(sb)->s_journal; if (journal) { - vfs_check_frozen(sb, SB_FREEZE_WRITE); + vfs_check_frozen(sb, SB_FREEZE_TRANS); ret = ext4_journal_force_commit(journal); } diff --git a/fs/fat/file.c b/fs/fat/file.c index 990dfae022e5..f2880571b3b0 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -102,7 +102,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) if (attr & ATTR_SYS) inode->i_flags |= S_IMMUTABLE; else - inode->i_flags &= S_IMMUTABLE; + inode->i_flags &= ~S_IMMUTABLE; } fat_save_attrs(inode, attr); diff --git a/fs/file_table.c b/fs/file_table.c index 5c7d10ead4ad..684ffe3c97ac 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -123,13 +123,13 @@ struct file *get_empty_filp(void) goto fail; percpu_counter_inc(&nr_files); + f->f_cred = get_cred(cred); if (security_file_alloc(f)) goto fail_sec; INIT_LIST_HEAD(&f->f_u.fu_list); atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); - f->f_cred = get_cred(cred); spin_lock_init(&f->f_lock); eventpoll_init_file(f); /* f->f_version: 0 */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9424796d6634..e5cdabf4945e 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1552,6 +1552,14 @@ __acquires(&fc->lock) } } +static void end_queued_requests(struct fuse_conn *fc) +{ + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + end_requests(fc, &fc->pending); + end_requests(fc, &fc->processing); +} + /* * Abort all requests. * @@ -1578,8 +1586,7 @@ void fuse_abort_conn(struct fuse_conn *fc) fc->connected = 0; fc->blocked = 0; end_io_requests(fc); - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); + end_queued_requests(fc); wake_up_all(&fc->waitq); wake_up_all(&fc->blocked_waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); @@ -1594,8 +1601,9 @@ int fuse_dev_release(struct inode *inode, struct file *file) if (fc) { spin_lock(&fc->lock); fc->connected = 0; - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); + fc->blocked = 0; + end_queued_requests(fc); + wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); fuse_conn_put(fc); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ada0adeb3bb5..9576aed1777c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/module.h> +#include <linux/compat.h> static const struct file_operations fuse_direct_io_file_operations; @@ -85,18 +86,52 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } +static void fuse_release_async(struct work_struct *work) +{ + struct fuse_req *req; + struct fuse_conn *fc; + struct path path; + + req = container_of(work, struct fuse_req, misc.release.work); + path = req->misc.release.path; + fc = get_fuse_conn(path.dentry->d_inode); + + fuse_put_request(fc, req); + path_put(&path); +} + static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) { - path_put(&req->misc.release.path); + if (fc->destroy_req) { + /* + * If this is a fuseblk mount, then it's possible that + * releasing the path will result in releasing the + * super block and sending the DESTROY request. If + * the server is single threaded, this would hang. + * For this reason do the path_put() in a separate + * thread. + */ + atomic_inc(&req->count); + INIT_WORK(&req->misc.release.work, fuse_release_async); + schedule_work(&req->misc.release.work); + } else { + path_put(&req->misc.release.path); + } } -static void fuse_file_put(struct fuse_file *ff) +static void fuse_file_put(struct fuse_file *ff, bool sync) { if (atomic_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - req->end = fuse_release_end; - fuse_request_send_background(ff->fc, req); + if (sync) { + fuse_request_send(ff->fc, req); + path_put(&req->misc.release.path); + fuse_put_request(ff->fc, req); + } else { + req->end = fuse_release_end; + fuse_request_send_background(ff->fc, req); + } kfree(ff); } } @@ -134,6 +169,7 @@ EXPORT_SYMBOL_GPL(fuse_do_open); void fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); if (ff->open_flags & FOPEN_DIRECT_IO) file->f_op = &fuse_direct_io_file_operations; @@ -141,6 +177,15 @@ void fuse_finish_open(struct inode *inode, struct file *file) invalidate_inode_pages2(inode->i_mapping); if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + i_size_write(inode, 0); + spin_unlock(&fc->lock); + fuse_invalidate_attr(inode); + } } int fuse_open_common(struct inode *inode, struct file *file, bool isdir) @@ -208,8 +253,12 @@ void fuse_release_common(struct file *file, int opcode) * Normally this will send the RELEASE request, however if * some asynchronous READ or WRITE requests are outstanding, * the sending will be delayed. + * + * Make the release synchronous if this is a fuseblk mount, + * synchronous RELEASE is allowed (and desirable) in this case + * because the server can be trusted not to screw up. */ - fuse_file_put(ff); + fuse_file_put(ff, ff->fc->destroy_req != NULL); } static int fuse_open(struct inode *inode, struct file *file) @@ -547,7 +596,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) page_cache_release(page); } if (req->ff) - fuse_file_put(req->ff); + fuse_file_put(req->ff, false); } static void fuse_send_readpages(struct fuse_req *req, struct file *file) @@ -1126,7 +1175,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { __free_page(req->pages[0]); - fuse_file_put(req->ff); + fuse_file_put(req->ff, false); } static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) @@ -1617,6 +1666,58 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, return 0; } +/* Make sure iov_length() won't overflow */ +static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) +{ + size_t n; + u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + + for (n = 0; n < count; n++) { + if (iov->iov_len > (size_t) max) + return -ENOMEM; + max -= iov->iov_len; + } + return 0; +} + +/* + * CUSE servers compiled on 32bit broke on 64bit kernels because the + * ABI was defined to be 'struct iovec' which is different on 32bit + * and 64bit. Fortunately we can determine which structure the server + * used from the size of the reply. + */ +static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) +{ +#ifdef CONFIG_COMPAT + if (count * sizeof(struct compat_iovec) == transferred) { + struct compat_iovec *ciov = src; + unsigned i; + + /* + * With this interface a 32bit server cannot support + * non-compat (i.e. ones coming from 64bit apps) ioctl + * requests + */ + if (!is_compat) + return -EINVAL; + + for (i = 0; i < count; i++) { + dst[i].iov_base = compat_ptr(ciov[i].iov_base); + dst[i].iov_len = ciov[i].iov_len; + } + return 0; + } +#endif + + if (count * sizeof(struct iovec) != transferred) + return -EIO; + + memcpy(dst, src, transferred); + return 0; +} + /* * For ioctls, there is no generic way to determine how much memory * needs to be read and/or written. Furthermore, ioctls are allowed @@ -1798,18 +1899,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - err = -EIO; - if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred) - goto out; - - /* okay, copy in iovs and retry */ vaddr = kmap_atomic(pages[0], KM_USER0); - memcpy(page_address(iov_page), vaddr, transferred); + err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr, + transferred, in_iovs + out_iovs, + (flags & FUSE_IOCTL_COMPAT) != 0); kunmap_atomic(vaddr, KM_USER0); + if (err) + goto out; in_iov = page_address(iov_page); out_iov = in_iov + in_iovs; + err = fuse_verify_ioctl_iov(in_iov, in_iovs); + if (err) + goto out; + + err = fuse_verify_ioctl_iov(out_iov, out_iovs); + if (err) + goto out; + goto retry; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8f309f04064e..6b9a74604ec3 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -21,6 +21,7 @@ #include <linux/rwsem.h> #include <linux/rbtree.h> #include <linux/poll.h> +#include <linux/workqueue.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -257,7 +258,10 @@ struct fuse_req { union { struct fuse_forget_in forget_in; struct { - struct fuse_release_in in; + union { + struct fuse_release_in in; + struct work_struct work; + }; struct path path; } release; struct fuse_init_in init_in; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 6a857e24f947..83917b50a196 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -932,7 +932,7 @@ int gfs2_logd(void *data) do { prepare_to_wait(&sdp->sd_logd_waitq, &wait, - TASK_UNINTERRUPTIBLE); + TASK_INTERRUPTIBLE); if (!gfs2_ail_flush_reqd(sdp) && !gfs2_jrnl_flush_reqd(sdp) && !kthread_should_stop()) diff --git a/fs/inode.c b/fs/inode.c index 722860b323a9..a4c0bf5dbf93 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -245,6 +245,20 @@ void destroy_inode(struct inode *inode) kmem_cache_free(inode_cachep, (inode)); } +void address_space_init_once(struct address_space *mapping) +{ + memset(mapping, 0, sizeof(*mapping)); + INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); + spin_lock_init(&mapping->tree_lock); + spin_lock_init(&mapping->i_mmap_lock); + INIT_LIST_HEAD(&mapping->private_list); + spin_lock_init(&mapping->private_lock); + INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); + mutex_init(&mapping->unmap_mutex); +} +EXPORT_SYMBOL(address_space_init_once); + /* * These are initializations that only need to be done * once, because the fields are idempotent across use @@ -256,13 +270,7 @@ void inode_init_once(struct inode *inode) INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); - INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.tree_lock); - spin_lock_init(&inode->i_data.i_mmap_lock); - INIT_LIST_HEAD(&inode->i_data.private_list); - spin_lock_init(&inode->i_data.private_lock); - INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); - INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); + address_space_init_once(&inode->i_data); i_size_ordered_init(inode); #ifdef CONFIG_INOTIFY INIT_LIST_HEAD(&inode->inotify_watches); diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 28a9ddaa0c49..bf21291e179a 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -745,8 +745,13 @@ wait_for_iobuf: required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* Wake up any transactions which were waiting for this - IO to complete */ + /* + * Wake up any transactions which were waiting for this + * IO to complete. The barrier must be here so that changes + * by journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 93d1e47647bd..7a629b796e6f 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -435,9 +435,12 @@ int __log_space_left(journal_t *journal) int __log_start_commit(journal_t *journal, tid_t target) { /* - * Are we already doing a recent enough commit? + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. */ - if (!tid_geq(journal->j_commit_request, target)) { + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { /* * We want a new commit: OK, mark the request and wakup the * commit thread. We do _not_ do the commit ourselves. @@ -449,7 +452,14 @@ int __log_start_commit(journal_t *journal, tid_t target) journal->j_commit_sequence); wake_up(&journal->j_wait_commit); return 1; - } + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); return 0; } diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index fa96bbb26343..2d7f165d0f1d 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -86,46 +86,25 @@ struct ea_buffer { #define EA_MALLOC 0x0008 +static int is_known_namespace(const char *name) +{ + if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + return false; + + return true; +} + /* * These three routines are used to recognize on-disk extended attributes * that are in a recognized namespace. If the attribute is not recognized, * "os2." is prepended to the name */ -static inline int is_os2_xattr(struct jfs_ea *ea) +static int is_os2_xattr(struct jfs_ea *ea) { - /* - * Check for "system." - */ - if ((ea->namelen >= XATTR_SYSTEM_PREFIX_LEN) && - !strncmp(ea->name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return false; - /* - * Check for "user." - */ - if ((ea->namelen >= XATTR_USER_PREFIX_LEN) && - !strncmp(ea->name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return false; - /* - * Check for "security." - */ - if ((ea->namelen >= XATTR_SECURITY_PREFIX_LEN) && - !strncmp(ea->name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN)) - return false; - /* - * Check for "trusted." - */ - if ((ea->namelen >= XATTR_TRUSTED_PREFIX_LEN) && - !strncmp(ea->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) - return false; - /* - * Add any other valid namespace prefixes here - */ - - /* - * We assume it's OS/2's flat namespace - */ - return true; + return !is_known_namespace(ea->name); } static inline int name_size(struct jfs_ea *ea) @@ -764,13 +743,23 @@ static int can_set_xattr(struct inode *inode, const char *name, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return can_set_system_xattr(inode, name, value, value_len); + if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) { + /* + * This makes sure that we aren't trying to set an + * attribute in a different namespace by prefixing it + * with "os2." + */ + if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN)) + return -EOPNOTSUPP; + return 0; + } + /* * Don't allow setting an attribute in an unknown namespace. */ if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && - strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && - strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) return -EOPNOTSUPP; return 0; @@ -952,19 +941,8 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, int xattr_size; ssize_t size; int namelen = strlen(name); - char *os2name = NULL; char *value; - if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { - os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1, - GFP_KERNEL); - if (!os2name) - return -ENOMEM; - strcpy(os2name, name + XATTR_OS2_PREFIX_LEN); - name = os2name; - namelen -= XATTR_OS2_PREFIX_LEN; - } - down_read(&JFS_IP(inode)->xattr_sem); xattr_size = ea_get(inode, &ea_buf, 0); @@ -1002,8 +980,6 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, out: up_read(&JFS_IP(inode)->xattr_sem); - kfree(os2name); - return size; } @@ -1012,6 +988,19 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, { int err; + if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { + /* + * skip past "os2." prefix + */ + name += XATTR_OS2_PREFIX_LEN; + /* + * Don't allow retrieving properly prefixed attributes + * by prepending them with "os2." + */ + if (is_known_namespace(name)) + return -EOPNOTSUPP; + } + err = __jfs_getxattr(dentry->d_inode, name, data, buf_size); return err; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index e20ee85955d1..f3f3578393a4 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -115,7 +115,7 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode) inode_inc_link_count(dir); - inode = minix_new_inode(dir, mode, &err); + inode = minix_new_inode(dir, S_IFDIR | mode, &err); if (!inode) goto out_dir; diff --git a/fs/namespace.c b/fs/namespace.c index 88058de59c7c..32dcd24bbc9a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1984,7 +1984,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; - flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d25b5257b7a1..e0067708a553 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -274,7 +274,7 @@ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, sin1->sin6_scope_id != sin2->sin6_scope_id) return 0; - return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr); + return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr); } #else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */ static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1, diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e60416d3f818..d69551e77b93 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1103,7 +1103,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) goto no_open_dput; /* We can't create new files, or truncate existing ones here */ - openflags &= ~(O_CREAT|O_TRUNC); + openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); /* * Note: we're not holding inode->i_mutex and so may be racing with diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index ad4cd31d6050..b1c23b7c97cc 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -402,15 +402,18 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, pos += vec->iov_len; } + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + nfs_direct_req_release(dreq); + return result < 0 ? result : -EIO; + } + if (put_dreq(dreq)) nfs_direct_complete(dreq); - - if (requested_bytes != 0) - return 0; - - if (result < 0) - return result; - return -EIO; + return 0; } static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, @@ -830,15 +833,18 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, pos += vec->iov_len; } + /* + * If no bytes were started, return the error, and let the + * generic layer handle the completion. + */ + if (requested_bytes == 0) { + nfs_direct_req_release(dreq); + return result < 0 ? result : -EIO; + } + if (put_dreq(dreq)) nfs_direct_write_complete(dreq, dreq->inode); - - if (requested_bytes != 0) - return 0; - - if (result < 0) - return result; - return -EIO; + return 0; } static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f036153d9f50..6fd4fba4ed76 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -222,7 +222,7 @@ static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode) have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); if (have_error) ret = xchg(&ctx->error, 0); - if (!ret) + if (!ret && status < 0) ret = status; return ret; } @@ -560,7 +560,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct file *filp = vma->vm_file; struct dentry *dentry = filp->f_path.dentry; unsigned pagelen; - int ret = -EINVAL; + int ret = VM_FAULT_NOPAGE; struct address_space *mapping; dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n", @@ -576,21 +576,20 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (mapping != dentry->d_inode->i_mapping) goto out_unlock; - ret = 0; pagelen = nfs_page_length(page); if (pagelen == 0) goto out_unlock; - ret = nfs_flush_incompatible(filp, page); - if (ret != 0) - goto out_unlock; + ret = VM_FAULT_LOCKED; + if (nfs_flush_incompatible(filp, page) == 0 && + nfs_updatepage(filp, page, 0, pagelen) == 0) + goto out; - ret = nfs_updatepage(filp, page, 0, pagelen); + ret = VM_FAULT_SIGBUS; out_unlock: - if (!ret) - return VM_FAULT_LOCKED; unlock_page(page); - return VM_FAULT_SIGBUS; +out: + return ret; } static const struct vm_operations_struct nfs_file_vm_ops = { @@ -697,6 +696,7 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = filp->f_mapping->host; int status = 0; + unsigned int saved_type = fl->fl_type; /* Try local locking first */ posix_test_lock(filp, fl); @@ -704,6 +704,7 @@ static int do_getlk(struct file *filp, int cmd, struct file_lock *fl) /* found a conflict */ goto out; } + fl->fl_type = saved_type; if (nfs_have_delegation(inode, FMODE_READ)) goto out_noconflict; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 099b3518feea..0cafbdb522c5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -811,9 +811,10 @@ out: return ret; } -static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); + unsigned long ret = 0; if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) @@ -821,25 +822,32 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->change_attr = fattr->change_attr; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; + ret |= NFS_INO_INVALID_ATTR; } /* If we have atomic WCC data, we may update some attributes */ if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) - && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + && timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + ret |= NFS_INO_INVALID_ATTR; + } if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) && (fattr->valid & NFS_ATTR_FATTR_MTIME) && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + ret |= NFS_INO_INVALID_ATTR; } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) - && nfsi->npages == 0) - i_size_write(inode, nfs_size_to_loff_t(fattr->size)); + && nfsi->npages == 0) { + i_size_write(inode, nfs_size_to_loff_t(fattr->size)); + ret |= NFS_INO_INVALID_ATTR; + } + return ret; } /** @@ -1153,7 +1161,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ - nfs_wcc_update_inode(inode, fattr); + invalid |= nfs_wcc_update_inode(inode, fattr); /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 59047f8d7d72..3dde50c093b5 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -503,13 +503,13 @@ static struct rpc_procinfo mnt3_procedures[] = { static struct rpc_version mnt_version1 = { .number = 1, - .nrprocs = 2, + .nrprocs = ARRAY_SIZE(mnt_procedures), .procs = mnt_procedures, }; static struct rpc_version mnt_version3 = { .number = 3, - .nrprocs = 2, + .nrprocs = ARRAY_SIZE(mnt3_procedures), .procs = mnt3_procedures, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 70015dd60a98..80943d88252a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -255,9 +255,6 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, nfs4_state_mark_reclaim_nograce(clp, state); goto do_state_recovery; case -NFS4ERR_STALE_STATEID: - if (state == NULL) - break; - nfs4_state_mark_reclaim_reboot(clp, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: goto do_state_recovery; @@ -1102,6 +1099,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * clear_bit(NFS_DELEGATED_STATE, &state->flags); smp_rmb(); if (state->n_rdwr != 0) { + clear_bit(NFS_O_RDWR_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate); if (ret != 0) return ret; @@ -1109,6 +1107,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * return -ESTALE; } if (state->n_wronly != 0) { + clear_bit(NFS_O_WRONLY_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate); if (ret != 0) return ret; @@ -1116,6 +1115,7 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state * return -ESTALE; } if (state->n_rdonly != 0) { + clear_bit(NFS_O_RDONLY_STATE, &state->flags); ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate); if (ret != 0) return ret; @@ -2023,7 +2023,8 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) struct rpc_cred *cred; struct nfs4_state *state; struct dentry *res; - fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + int open_flags = nd->intent.open.flags; + fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); if (nd->flags & LOOKUP_CREATE) { attr.ia_mode = nd->intent.open.create_mode; @@ -2031,8 +2032,9 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) if (!IS_POSIXACL(dir)) attr.ia_mode &= ~current_umask(); } else { + open_flags &= ~O_EXCL; attr.ia_valid = 0; - BUG_ON(nd->intent.open.flags & O_CREAT); + BUG_ON(open_flags & O_CREAT); } cred = rpc_lookup_cred(); @@ -2041,7 +2043,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ nfs_block_sillyrename(parent); - state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred); + state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred); put_rpccred(cred); if (IS_ERR(state)) { if (PTR_ERR(state) == -ENOENT) { @@ -3273,6 +3275,35 @@ static void buf_to_pages(const void *buf, size_t buflen, } } +static int buf_to_pages_noslab(const void *buf, size_t buflen, + struct page **pages, unsigned int *pgbase) +{ + struct page *newpage, **spages; + int rc = 0; + size_t len; + spages = pages; + + do { + len = min_t(size_t, PAGE_CACHE_SIZE, buflen); + newpage = alloc_page(GFP_KERNEL); + + if (newpage == NULL) + goto unwind; + memcpy(page_address(newpage), buf, len); + buf += len; + buflen -= len; + *pages++ = newpage; + rc++; + } while (buflen != 0); + + return rc; + +unwind: + for(; rc > 0; rc--) + __free_page(spages[rc-1]); + return -ENOMEM; +} + struct nfs4_cached_acl { int cached; size_t len; @@ -3439,13 +3470,23 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl .rpc_argp = &arg, .rpc_resp = &res, }; - int ret; + int ret, i; if (!nfs4_server_supports_acls(server)) return -EOPNOTSUPP; + i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase); + if (i < 0) + return i; nfs_inode_return_delegation(inode); - buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); ret = nfs4_call_sync(server, &msg, &arg, &res, 1); + + /* + * Free each page after tx, so the only ref left is + * held by the network stack + */ + for (; i > 0; i--) + put_page(pages[i-1]); + nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); return ret; @@ -3477,9 +3518,6 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, nfs4_state_mark_reclaim_nograce(clp, state); goto do_state_recovery; case -NFS4ERR_STALE_STATEID: - if (state == NULL) - break; - nfs4_state_mark_reclaim_reboot(clp, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: goto do_state_recovery; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 34acf5926fdc..55bfe0e2afe8 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1111,17 +1111,14 @@ static void nfs4_reclaim_complete(struct nfs_client *clp, (void)ops->reclaim_complete(clp); } -static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) +static int nfs4_state_clear_reclaim_reboot(struct nfs_client *clp) { struct nfs4_state_owner *sp; struct rb_node *pos; struct nfs4_state *state; if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) - return; - - nfs4_reclaim_complete(clp, - nfs4_reboot_recovery_ops[clp->cl_minorversion]); + return 0; for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); @@ -1135,6 +1132,14 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) } nfs_delegation_reap_unclaimed(clp); + return 1; +} + +static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) +{ + if (!nfs4_state_clear_reclaim_reboot(clp)) + return; + nfs4_reclaim_complete(clp, nfs4_reboot_recovery_ops[clp->cl_minorversion]); } static void nfs_delegation_clear_all(struct nfs_client *clp) @@ -1161,7 +1166,7 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_LEASE_MOVED: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_end_reclaim_reboot(clp); + nfs4_state_clear_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); break; case -NFS4ERR_EXPIRED: @@ -1405,7 +1410,7 @@ static void nfs4_state_manager(struct nfs_client *clp) int status = 0; /* Ensure exclusive access to NFSv4 state */ - for(;;) { + do { if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) { /* We're going to have to re-establish a clientid */ status = nfs4_reclaim_lease(clp); @@ -1488,7 +1493,7 @@ static void nfs4_state_manager(struct nfs_client *clp) break; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) break; - } + } while (atomic_read(&clp->cl_count) > 1); return; out_error: printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s" diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f9df16de4a56..f9fce46a9713 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -431,7 +431,15 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) goto out_err; error = server->nfs_client->rpc_ops->statfs(server, fh, &res); + if (unlikely(error == -ESTALE)) { + struct dentry *pd_dentry; + pd_dentry = dget_parent(dentry); + if (pd_dentry != NULL) { + nfs_zap_caches(pd_dentry->d_inode); + dput(pd_dentry); + } + } nfs_free_fattr(res.fattr); if (error < 0) goto out_err; @@ -652,6 +660,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, if (nfss->options & NFS_OPTION_FSCACHE) seq_printf(m, ",fsc"); + + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) { + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + seq_printf(m, ",lookupcache=none"); + else + seq_printf(m, ",lookupcache=pos"); + } } /* @@ -1986,6 +2001,15 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) if (error < 0) goto out; + /* + * noac is a special case. It implies -o sync, but that's not + * necessarily reflected in the mtab options. do_remount_sb + * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the + * remount options, so we have to explicitly reset it. + */ + if (data->flags & NFS_MOUNT_NOAC) + *flags |= MS_SYNCHRONOUS; + /* compare new mount options with old ones */ error = nfs_compare_remount_data(nfss, data); out: diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 0c6d81670137..7c831a2731fa 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -38,7 +38,6 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) exp_readlock(); nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); fh_put(&fh); - rqstp->rq_client = NULL; exp_readunlock(); /* We return nlm error codes as nlm doesn't know * about nfsd, but nfsd does know about nlm.. diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 2a533a0af2a9..7e84a852cdae 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -260,9 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp) err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, &fhp->fh_post_attr); fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; - if (err) + if (err) { fhp->fh_post_saved = 0; - else + /* Grab the ctime anyway - set_change_info might use it */ + fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime; + } else fhp->fh_post_saved = 1; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 59ec449b0c7f..bb775a595989 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -954,8 +954,8 @@ typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *, void *); enum nfsd4_op_flags { ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ - ALLOWED_ON_ABSENT_FS = 2 << 0, /* ops processed on absent fs */ - ALLOWED_AS_FIRST_OP = 3 << 0, /* ops reqired first in compound */ + ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ + ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */ }; struct nfsd4_operation { diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index ac17a7080239..d88539d93caf 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -316,8 +316,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, READ_BUF(dummy32); len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); - if ((host_err = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) - goto out_nfserr; + if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) + return status; iattr->ia_valid |= ATTR_UID; } if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { @@ -327,8 +327,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, READ_BUF(dummy32); len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); - if ((host_err = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) - goto out_nfserr; + if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) + return status; iattr->ia_valid |= ATTR_GID; } if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { @@ -1180,8 +1180,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, READ_BUF(4); READ32(dummy); READ_BUF(dummy * 4); - for (i = 0; i < dummy; ++i) - READ32(dummy); break; case RPC_AUTH_GSS: dprintk("RPC_AUTH_GSS callback secflavor " @@ -1197,7 +1195,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, READ_BUF(4); READ32(dummy); READ_BUF(dummy); - p += XDR_QUADLEN(dummy); break; default: dprintk("Illegal callback secflavor\n"); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 3c111120b619..a57c453ddfd6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -819,7 +819,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino) if (ra->p_count == 0) frap = rap; } - depth = nfsdstats.ra_size*11/10; + depth = nfsdstats.ra_size; if (!frap) { spin_unlock(&rab->pb_lock); return NULL; @@ -1386,7 +1386,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; if (!(iap->ia_valid & ATTR_MODE)) iap->ia_mode = 0; - err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC); if (err) goto out; @@ -1408,6 +1408,13 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dchild)) goto out_nfserr; + /* If file doesn't exist, check for permissions to create one */ + if (!dchild->d_inode) { + err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); + if (err) + goto out; + } + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); if (err) goto out; diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 4d476ff08ae6..60fce3dc5cb5 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -484,18 +484,17 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) static inline void set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { - BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); - cinfo->atomic = 1; + BUG_ON(!fhp->fh_pre_saved); + cinfo->atomic = fhp->fh_post_saved; cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); - if (cinfo->change_supported) { - cinfo->before_change = fhp->fh_pre_change; - cinfo->after_change = fhp->fh_post_change; - } else { - cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; - cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; - cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; - cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; - } + + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + } int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 447ce47a3306..7133a6145cfe 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -35,19 +35,6 @@ #include "btnode.h" -void nilfs_btnode_cache_init_once(struct address_space *btnc) -{ - memset(btnc, 0, sizeof(*btnc)); - INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC); - spin_lock_init(&btnc->tree_lock); - INIT_LIST_HEAD(&btnc->private_list); - spin_lock_init(&btnc->private_lock); - - spin_lock_init(&btnc->i_mmap_lock); - INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap); - INIT_LIST_HEAD(&btnc->i_mmap_nonlinear); -} - static const struct address_space_operations def_btnode_aops = { .sync_page = block_sync_page, }; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 07da83f07712..fa2f1e68f4d1 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt { struct buffer_head *newbh; }; -void nilfs_btnode_cache_init_once(struct address_space *); void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index c9a30d7ff6fc..d9d5d8141b4f 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -72,10 +72,9 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* * check to see if the page is mapped already (no holes) */ - if (PageMappedToDisk(page)) { - unlock_page(page); + if (PageMappedToDisk(page)) goto mapped; - } + if (page_has_buffers(page)) { struct buffer_head *bh, *head; int fully_mapped = 1; @@ -90,7 +89,6 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (fully_mapped) { SetPageMappedToDisk(page); - unlock_page(page); goto mapped; } } @@ -105,16 +103,18 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; ret = block_page_mkwrite(vma, vmf, nilfs_get_block); - if (unlikely(ret)) { + if (ret != VM_FAULT_LOCKED) { nilfs_transaction_abort(inode->i_sb); return ret; } + nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, + 1 << (PAGE_SHIFT - inode->i_blkbits)); nilfs_transaction_commit(inode->i_sb); mapped: SetPageChecked(page); wait_on_page_writeback(page); - return 0; + return VM_FAULT_LOCKED; } static const struct vm_operations_struct nilfs_file_vm_ops = { diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 414ef68931cf..79542d014e02 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -336,9 +336,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) list_add(&sbi->s_list, &nilfs->ns_supers); up_write(&nilfs->ns_super_sem); + err = -ENOMEM; sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size); if (!sbi->s_ifile) - return -ENOMEM; + goto delist; down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, @@ -369,6 +370,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) nilfs_mdt_destroy(sbi->s_ifile); sbi->s_ifile = NULL; + delist: down_write(&nilfs->ns_super_sem); list_del_init(&sbi->s_list); up_write(&nilfs->ns_super_sem); @@ -1118,7 +1120,7 @@ static void nilfs_inode_init_once(void *obj) #ifdef CONFIG_NILFS_XATTR init_rwsem(&ii->xattr_sem); #endif - nilfs_btnode_cache_init_once(&ii->i_btnode_cache); + address_space_init_once(&ii->i_btnode_cache); ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union; inode_init_once(&ii->vfs_inode); } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index e46ca685b9be..72f882552608 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -96,8 +96,11 @@ static inline __u32 inotify_arg_to_mask(u32 arg) { __u32 mask; - /* everything should accept their own ignored and cares about children */ - mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD); + /* + * everything should accept their own ignored, cares about children, + * and should receive events when the inode is unmounted + */ + mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD | FS_UNMOUNT); /* mask off the flags used to open the fd */ mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT)); @@ -671,6 +674,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) if (ret >= 0) return ret; + fsnotify_put_group(group); atomic_dec(&user->inotify_devs); out_free_uid: free_uid(user); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index da702294d7e7..a76e0aa5cd3f 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -290,12 +290,30 @@ static int ocfs2_set_acl(handle_t *handle, int ocfs2_check_acl(struct inode *inode, int mask) { - struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + struct posix_acl *acl; + int ret = -EAGAIN; - if (IS_ERR(acl)) + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return ret; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh); + + brelse(di_bh); + + if (IS_ERR(acl)) { + mlog_errno(PTR_ERR(acl)); return PTR_ERR(acl); + } if (acl) { - int ret = posix_acl_permission(inode, acl, mask); + ret = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); return ret; } @@ -344,7 +362,7 @@ int ocfs2_init_acl(handle_t *handle, { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct posix_acl *acl = NULL; - int ret = 0; + int ret = 0, ret2; mode_t mode; if (!S_ISLNK(inode->i_mode)) { @@ -381,7 +399,12 @@ int ocfs2_init_acl(handle_t *handle, mode = inode->i_mode; ret = posix_acl_create_masq(clone, &mode); if (ret >= 0) { - ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret2) { + mlog_errno(ret2); + ret = ret2; + goto cleanup; + } if (ret > 0) { ret = ocfs2_set_acl(handle, inode, di_bh, ACL_TYPE_ACCESS, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 356e976772bf..1514e271d1b9 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -578,7 +578,9 @@ bail: static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes, - void *private) + void *private, + int ret, + bool is_async) { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; int level; @@ -592,6 +594,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, if (!level) up_read(&inode->i_alloc_sem); ocfs2_rw_unlock(inode, level); + + if (is_async) + aio_complete(iocb, ret, 0); } /* @@ -1034,6 +1039,12 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, &cluster_start, &cluster_end); + /* treat the write as new if the a hole/lseek spanned across + * the page boundary. + */ + new = new | ((i_size_read(inode) <= page_offset(page)) && + (page_offset(page) <= user_pos)); + if (page == wc->w_target_page) { map_from = user_pos & (PAGE_CACHE_SIZE - 1); map_to = map_from + user_len; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 94b97fc6a88e..ffb4c68dafa4 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -511,8 +511,6 @@ static void dlm_lockres_release(struct kref *kref) atomic_dec(&dlm->res_cur_count); - dlm_put(dlm); - if (!hlist_unhashed(&res->hash_node) || !list_empty(&res->granted) || !list_empty(&res->converting) || @@ -585,8 +583,6 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->migration_pending = 0; res->inflight_locks = 0; - /* put in dlm_lockres_release */ - dlm_grab(dlm); res->dlm = dlm; kref_init(&res->refs); @@ -3050,8 +3046,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, /* check for pre-existing lock */ spin_lock(&dlm->spinlock); res = __dlm_lookup_lockres(dlm, name, namelen, hash); - spin_lock(&dlm->master_lock); - if (res) { spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { @@ -3069,14 +3063,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&res->spinlock); } + spin_lock(&dlm->master_lock); /* ignore status. only nonzero status would BUG. */ ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, migrate->new_master, migrate->master); -unlock: spin_unlock(&dlm->master_lock); +unlock: spin_unlock(&dlm->spinlock); if (oldmle) { diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 9dfaac73b36d..aaaffbcbe916 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1997,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, struct list_head *queue; struct dlm_lock *lock, *next; + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); res->state |= DLM_LOCK_RES_RECOVERING; if (!list_empty(&res->recovering)) { mlog(0, @@ -2326,19 +2328,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) /* zero the lvb if necessary */ dlm_revalidate_lvb(dlm, res, dead_node); if (res->owner == dead_node) { - if (res->state & DLM_LOCK_RES_DROPPING_REF) - mlog(0, "%s:%.*s: owned by " - "dead node %u, this node was " - "dropping its ref when it died. " - "continue, dropping the flag.\n", - dlm->name, res->lockname.len, - res->lockname.name, dead_node); - - /* the wake_up for this will happen when the - * RECOVERING flag is dropped later */ - res->state &= ~DLM_LOCK_RES_DROPPING_REF; + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + mlog(ML_NOTICE, "Ignore %.*s for " + "recovery as it is being freed\n", + res->lockname.len, + res->lockname.name); + } else + dlm_move_lockres_to_recovery_list(dlm, + res); - dlm_move_lockres_to_recovery_list(dlm, res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index d4f73ca68fe5..2211acf33d9b 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -92,19 +92,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res) * truly ready to be freed. */ int __dlm_lockres_unused(struct dlm_lock_resource *res) { - if (!__dlm_lockres_has_locks(res) && - (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) { - /* try not to scan the bitmap unless the first two - * conditions are already true */ - int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); - if (bit >= O2NM_MAX_NODES) { - /* since the bit for dlm->node_num is not - * set, inflight_locks better be zero */ - BUG_ON(res->inflight_locks != 0); - return 1; - } - } - return 0; + int bit; + + if (__dlm_lockres_has_locks(res)) + return 0; + + if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) + return 0; + + if (res->state & DLM_LOCK_RES_RECOVERING) + return 0; + + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (bit < O2NM_MAX_NODES) + return 0; + + /* + * since the bit for dlm->node_num is not set, inflight_locks better + * be zero + */ + BUG_ON(res->inflight_locks != 0); + return 1; } @@ -152,45 +160,25 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, spin_unlock(&dlm->spinlock); } -static int dlm_purge_lockres(struct dlm_ctxt *dlm, +static void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { int master; int ret = 0; - spin_lock(&res->spinlock); - if (!__dlm_lockres_unused(res)) { - mlog(0, "%s:%.*s: tried to purge but not unused\n", - dlm->name, res->lockname.len, res->lockname.name); - __dlm_print_one_lock_resource(res); - spin_unlock(&res->spinlock); - BUG(); - } - - if (res->state & DLM_LOCK_RES_MIGRATING) { - mlog(0, "%s:%.*s: Delay dropref as this lockres is " - "being remastered\n", dlm->name, res->lockname.len, - res->lockname.name); - /* Re-add the lockres to the end of the purge list */ - if (!list_empty(&res->purge)) { - list_del_init(&res->purge); - list_add_tail(&res->purge, &dlm->purge_list); - } - spin_unlock(&res->spinlock); - return 0; - } + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); master = (res->owner == dlm->node_num); - if (!master) - res->state |= DLM_LOCK_RES_DROPPING_REF; - spin_unlock(&res->spinlock); mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, res->lockname.name, master); if (!master) { + res->state |= DLM_LOCK_RES_DROPPING_REF; /* drop spinlock... retake below */ + spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); spin_lock(&res->spinlock); @@ -208,31 +196,35 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm, mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", dlm->name, res->lockname.len, res->lockname.name, ret); spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); } - spin_lock(&res->spinlock); if (!list_empty(&res->purge)) { mlog(0, "removing lockres %.*s:%p from purgelist, " "master = %d\n", res->lockname.len, res->lockname.name, res, master); list_del_init(&res->purge); - spin_unlock(&res->spinlock); dlm_lockres_put(res); dlm->purge_count--; - } else - spin_unlock(&res->spinlock); + } + + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + BUG(); + } __dlm_unhash_lockres(res); /* lockres is not in the hash now. drop the flag and wake up * any processes waiting in dlm_get_lock_resource. */ if (!master) { - spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_DROPPING_REF; spin_unlock(&res->spinlock); wake_up(&res->wq); - } - return 0; + } else + spin_unlock(&res->spinlock); } static void dlm_run_purge_list(struct dlm_ctxt *dlm, @@ -251,17 +243,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, lockres = list_entry(dlm->purge_list.next, struct dlm_lock_resource, purge); - /* Status of the lockres *might* change so double - * check. If the lockres is unused, holding the dlm - * spinlock will prevent people from getting and more - * refs on it -- there's no need to keep the lockres - * spinlock. */ spin_lock(&lockres->spinlock); - unused = __dlm_lockres_unused(lockres); - spin_unlock(&lockres->spinlock); - - if (!unused) - continue; purge_jiffies = lockres->last_used + msecs_to_jiffies(DLM_PURGE_INTERVAL_MS); @@ -273,15 +255,29 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, * in tail order, we can stop at the first * unpurgable resource -- anyone added after * him will have a greater last_used value */ + spin_unlock(&lockres->spinlock); break; } + /* Status of the lockres *might* change so double + * check. If the lockres is unused, holding the dlm + * spinlock will prevent people from getting and more + * refs on it. */ + unused = __dlm_lockres_unused(lockres); + if (!unused || + (lockres->state & DLM_LOCK_RES_MIGRATING)) { + mlog(0, "lockres %s:%.*s: is in use or " + "being remastered, used %d, state %d\n", + dlm->name, lockres->lockname.len, + lockres->lockname.name, !unused, lockres->state); + list_move_tail(&dlm->purge_list, &lockres->purge); + spin_unlock(&lockres->spinlock); + continue; + } + dlm_lockres_get(lockres); - /* This may drop and reacquire the dlm spinlock if it - * has to do migration. */ - if (dlm_purge_lockres(dlm, lockres)) - BUG(); + dlm_purge_lockres(dlm, lockres); dlm_lockres_put(lockres); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index abb0a95cc717..201e7bcec75e 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -488,7 +488,11 @@ static int ocfs2_read_locked_inode(struct inode *inode, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); - if (!status) + /* + * If buffer is in jbd, then its checksum may not have been + * computed as yet. + */ + if (!status && !buffer_jbd(bh)) status = ocfs2_validate_inode_block(osb->sb, bh); } if (status < 0) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 3ac5aa733e9c..c90b2dd25e0f 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2436,16 +2436,26 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + le32_to_cpu(rec.r_clusters)) - cpos; /* - * If the refcount rec already exist, cool. We just need - * to check whether there is a split. Otherwise we just need - * to increase the refcount. - * If we will insert one, increases recs_add. - * * We record all the records which will be inserted to the * same refcount block, so that we can tell exactly whether * we need a new refcount block or not. + * + * If we will insert a new one, this is easy and only happens + * during adding refcounted flag to the extent, so we don't + * have a chance of spliting. We just need one record. + * + * If the refcount rec already exists, that would be a little + * complicated. we may have to: + * 1) split at the beginning if the start pos isn't aligned. + * we need 1 more record in this case. + * 2) split int the end if the end pos isn't aligned. + * we need 1 more record in this case. + * 3) split in the middle because of file system fragmentation. + * we need 2 more records in this case(we can't detect this + * beforehand, so always think of the worst case). */ if (rec.r_refcount) { + recs_add += 2; /* Check whether we need a split at the beginning. */ if (cpos == start_cpos && cpos != le64_to_cpu(rec.r_cpos)) @@ -3205,7 +3215,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, u32 num_clusters, unsigned int e_flags) { int ret, delete, index, credits = 0; - u32 new_bit, new_len; + u32 new_bit, new_len, orig_num_clusters; unsigned int set_len; struct ocfs2_super *osb = OCFS2_SB(sb); handle_t *handle; @@ -3238,6 +3248,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, goto out; } + orig_num_clusters = num_clusters; + while (num_clusters) { ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, p_cluster, num_clusters, @@ -3325,7 +3337,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, * in write-back mode. */ if (context->get_clusters == ocfs2_di_get_clusters) { - ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); + ret = ocfs2_cow_sync_writeback(sb, context, cpos, + orig_num_clusters); if (ret) mlog_errno(ret); } diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 2dc57bca0688..22db114a4f0a 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -191,7 +191,7 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) return c; } - return c; + return NULL; } /* diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 32499d213fc4..9975457c981f 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -128,7 +128,7 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry, } /* Fast symlinks can't be large */ - len = strlen(target); + len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb)); link = kzalloc(len + 1, GFP_NOFS); if (!link) { status = -ENOMEM; diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 9efb2cfe2410..3e0ee124a1b3 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -347,6 +347,12 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, goto fail; } + /* Check that sizeof_partition_entry has the correct value */ + if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { + pr_debug("GUID Partitition Entry Size check failed.\n"); + goto fail; + } + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 648c9d8f3357..db07e6dcc0e9 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -251,6 +251,11 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) } vm->vblk_size = get_unaligned_be32(data + 0x08); + if (vm->vblk_size == 0) { + ldm_error ("Illegal VBLK size"); + return false; + } + vm->vblk_offset = get_unaligned_be32(data + 0x0C); vm->last_vblk_seq = get_unaligned_be32(data + 0x04); @@ -1294,6 +1299,11 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) BUG_ON (!data || !frags); + if (size < 2 * VBLK_SIZE_HEAD) { + ldm_error("Value of size is to small."); + return false; + } + group = get_unaligned_be32(data + 0x08); rec = get_unaligned_be16(data + 0x0C); num = get_unaligned_be16(data + 0x0E); @@ -1301,6 +1311,10 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) ldm_error ("A VBLK claims to have %d parts.", num); return false; } + if (rec >= num) { + ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num); + return false; + } list_for_each (item, frags) { f = list_entry (item, struct frag, list); @@ -1321,6 +1335,11 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) list_add_tail (&f->list, frags); found: + if (rec >= f->num) { + ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num); + return false; + } + if (f->map & (1 << rec)) { ldm_error ("Duplicate VBLK, part %d.", rec); f->map &= 0x7F; /* Mark the group as broken */ @@ -1329,10 +1348,9 @@ found: f->map |= (1 << rec); - if (num > 0) { - data += VBLK_SIZE_HEAD; - size -= VBLK_SIZE_HEAD; - } + data += VBLK_SIZE_HEAD; + size -= VBLK_SIZE_HEAD; + memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size); return true; diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c index 74465ff7c263..0984d92bcf86 100644 --- a/fs/partitions/mac.c +++ b/fs/partitions/mac.c @@ -29,10 +29,9 @@ static inline void mac_fix_string(char *stg, int len) int mac_partition(struct parsed_partitions *state) { - int slot = 1; Sector sect; unsigned char *data; - int blk, blocks_in_map; + int slot, blocks_in_map; unsigned secsize; #ifdef CONFIG_PPC_PMAC int found_root = 0; @@ -59,10 +58,14 @@ int mac_partition(struct parsed_partitions *state) put_dev_sector(sect); return 0; /* not a MacOS disk */ } - printk(" [mac]"); blocks_in_map = be32_to_cpu(part->map_count); - for (blk = 1; blk <= blocks_in_map; ++blk) { - int pos = blk * secsize; + if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) { + put_dev_sector(sect); + return 0; + } + printk(" [mac]"); + for (slot = 1; slot <= blocks_in_map; ++slot) { + int pos = slot * secsize; put_dev_sector(sect); data = read_part_sector(state, pos/512, §); if (!data) @@ -113,13 +116,11 @@ int mac_partition(struct parsed_partitions *state) } if (goodness > found_root_goodness) { - found_root = blk; + found_root = slot; found_root_goodness = goodness; } } #endif /* CONFIG_PPC_PMAC */ - - ++slot; } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c index fc22b85d436a..5f71c928af76 100644 --- a/fs/partitions/osf.c +++ b/fs/partitions/osf.c @@ -10,10 +10,13 @@ #include "check.h" #include "osf.h" +#define MAX_OSF_PARTITIONS 18 + int osf_partition(struct parsed_partitions *state) { int i; int slot = 1; + unsigned int npartitions; Sector sect; unsigned char *data; struct disklabel { @@ -45,7 +48,7 @@ int osf_partition(struct parsed_partitions *state) u8 p_fstype; u8 p_frag; __le16 p_cpg; - } d_partitions[8]; + } d_partitions[MAX_OSF_PARTITIONS]; } * label; struct d_partition * partition; @@ -63,7 +66,12 @@ int osf_partition(struct parsed_partitions *state) put_dev_sector(sect); return 0; } - for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) { + npartitions = le16_to_cpu(label->d_npartitions); + if (npartitions > MAX_OSF_PARTITIONS) { + put_dev_sector(sect); + return 0; + } + for (i = 0 ; i < npartitions; i++, partition++) { if (slot == state->limit) break; if (le32_to_cpu(partition->p_size)) diff --git a/fs/pipe.c b/fs/pipe.c index 279eef96c51c..a58d7ee7ad18 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -382,7 +382,7 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, error = ops->confirm(pipe, buf); if (error) { if (!ret) - error = ret; + ret = error; break; } @@ -1197,12 +1197,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, return ret; } +/* + * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same + * location, so checking ->i_pipe is not enough to verify that this is a + * pipe. + */ +struct pipe_inode_info *get_pipe_info(struct file *file) +{ + struct inode *i = file->f_path.dentry->d_inode; + + return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; +} + long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe; long ret; - pipe = file->f_path.dentry->d_inode->i_pipe; + pipe = get_pipe_info(file); if (!pipe) return -EBADF; diff --git a/fs/proc/array.c b/fs/proc/array.c index fff6572676ae..9e5f43084912 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -353,9 +353,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_cap(m, task); task_cpus_allowed(m, task); cpuset_task_status_allowed(m, task); -#if defined(CONFIG_S390) - task_show_regs(m, task); -#endif task_context_switch_counts(m, task); return 0; } @@ -492,8 +489,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, vsize, mm ? get_mm_rss(mm) : 0, rsslim, - mm ? mm->start_code : 0, - mm ? mm->end_code : 0, + mm ? (permitted ? mm->start_code : 1) : 0, + mm ? (permitted ? mm->end_code : 1) : 0, (permitted && mm) ? mm->start_stack : 0, esp, eip, diff --git a/fs/proc/base.c b/fs/proc/base.c index acb7ef80ea4f..c410e23ecd79 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2514,6 +2514,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) struct task_io_accounting acct = task->ioac; unsigned long flags; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + return -EACCES; + if (whole && lock_task_sighand(task, &flags)) { struct task_struct *t = task; @@ -2636,7 +2639,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, proc_tgid_io_accounting), + INF("io", S_IRUSR, proc_tgid_io_accounting), #endif }; @@ -2867,11 +2870,16 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode); + unsigned int nr; + struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; + if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) + goto out_no_task; + nr = filp->f_pos - FIRST_PROCESS_ENTRY; + + reaper = get_proc_task(filp->f_path.dentry->d_inode); if (!reaper) goto out_no_task; @@ -2967,7 +2975,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, proc_tid_io_accounting), + INF("io", S_IRUSR, proc_tid_io_accounting), #endif }; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 439fc1f1c1c4..0dfd815e4cfd 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -224,7 +224,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) /* We don't show the stack guard page in /proc/maps */ start = vma->vm_start; if (vma->vm_flags & VM_GROWSDOWN) - start += PAGE_SIZE; + if (!vma_stack_continue(vma->vm_prev, vma->vm_start)) + start += PAGE_SIZE; seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", start, @@ -247,8 +248,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) const char *name = arch_vma_name(vma); if (!name) { if (mm) { - if (vma->vm_start <= mm->start_brk && - vma->vm_end >= mm->brk) { + if (vma->vm_start <= mm->brk && + vma->vm_end >= mm->start_brk) { name = "[heap]"; } else if (vma->vm_start <= mm->start_stack && vma->vm_end >= mm->start_stack) { diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 91c817ff02c3..2367fb3f70bc 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -163,7 +163,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, static const struct file_operations proc_vmcore_operations = { .read = read_vmcore, - .llseek = generic_file_llseek, + .llseek = default_llseek, }; static struct vmcore* __init get_new_element(void) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 437d2ca2de97..42b08818d0e2 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -422,7 +422,7 @@ EXPORT_SYMBOL(dquot_acquire); */ int dquot_commit(struct dquot *dquot) { - int ret = 0, ret2 = 0; + int ret = 0; struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dqopt->dqio_mutex); @@ -434,15 +434,10 @@ int dquot_commit(struct dquot *dquot) spin_unlock(&dq_list_lock); /* Inactive dquot can be only if there was error during read/init * => we have better not writing it */ - if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { + if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); - if (info_dirty(&dqopt->info[dquot->dq_type])) { - ret2 = dqopt->ops[dquot->dq_type]->write_file_info( - dquot->dq_sb, dquot->dq_type); - } - if (ret >= 0) - ret = ret2; - } + else + ret = -EIO; out_sem: mutex_unlock(&dqopt->dqio_mutex); return ret; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index d532c20fc179..9bfb81b8bffa 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -112,6 +112,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) SetPageDirty(page); unlock_page(page); + put_page(page); } return 0; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index f53505de0712..4131f4a49391 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -170,6 +170,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page, int reiserfs_unpack(struct inode *inode, struct file *filp) { int retval = 0; + int depth; int index; struct page *page; struct address_space *mapping; @@ -185,11 +186,10 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) return 0; } - /* we need to make sure nobody is changing the file size beneath - ** us - */ - mutex_lock(&inode->i_mutex); - reiserfs_write_lock(inode->i_sb); + depth = reiserfs_write_lock_once(inode->i_sb); + + /* we need to make sure nobody is changing the file size beneath us */ + reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); write_from = inode->i_size & (blocksize - 1); /* if we are on a block boundary, we are already unpacked. */ @@ -224,6 +224,6 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) out: mutex_unlock(&inode->i_mutex); - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); return retval; } diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 536d697a8a28..90d2fcb67a31 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -472,7 +472,9 @@ int reiserfs_acl_chmod(struct inode *inode) struct reiserfs_transaction_handle th; size_t size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(clone->a_count)); - reiserfs_write_lock(inode->i_sb); + int depth; + + depth = reiserfs_write_lock_once(inode->i_sb); error = journal_begin(&th, inode->i_sb, size * 2); if (!error) { int error2; @@ -482,7 +484,7 @@ int reiserfs_acl_chmod(struct inode *inode) if (error2) error = error2; } - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); } posix_acl_release(clone); return error; diff --git a/fs/signalfd.c b/fs/signalfd.c index f329849ce3c0..1c5a6add779d 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -88,6 +88,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid); err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun); err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); + err |= __put_user(kinfo->si_int, &uinfo->ssi_int); break; case __SI_POLL: err |= __put_user(kinfo->si_band, &uinfo->ssi_band); @@ -111,6 +112,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); + err |= __put_user(kinfo->si_int, &uinfo->ssi_int); break; default: /* diff --git a/fs/splice.c b/fs/splice.c index efdbfece9932..188f61aa79c9 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1323,18 +1323,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); -/* - * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same - * location, so checking ->i_pipe is not enough to verify that this is a - * pipe. - */ -static inline struct pipe_inode_info *pipe_info(struct inode *inode) -{ - if (S_ISFIFO(inode->i_mode)) - return inode->i_pipe; - - return NULL; -} /* * Determine where to splice to/from. @@ -1348,8 +1336,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, loff_t offset, *off; long ret; - ipipe = pipe_info(in->f_path.dentry->d_inode); - opipe = pipe_info(out->f_path.dentry->d_inode); + ipipe = get_pipe_info(in); + opipe = get_pipe_info(out); if (ipipe && opipe) { if (off_in || off_out) @@ -1567,7 +1555,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, int error; long ret; - pipe = pipe_info(file->f_path.dentry->d_inode); + pipe = get_pipe_info(file); if (!pipe) return -EBADF; @@ -1654,7 +1642,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, }; long ret; - pipe = pipe_info(file->f_path.dentry->d_inode); + pipe = get_pipe_info(file); if (!pipe) return -EBADF; @@ -2034,8 +2022,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, static long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); - struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); + struct pipe_inode_info *ipipe = get_pipe_info(in); + struct pipe_inode_info *opipe = get_pipe_info(out); int ret = -EINVAL; /* diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c index 12b933ac6585..a37d445cfff9 100644 --- a/fs/squashfs/dir.c +++ b/fs/squashfs/dir.c @@ -172,6 +172,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) length += sizeof(dirh); dir_count = le32_to_cpu(dirh.count) + 1; + + /* dir_count should never be larger than 256 */ + if (dir_count > 256) + goto failed_read; + while (dir_count--) { /* * Read directory entry. @@ -183,6 +188,10 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) size = le16_to_cpu(dire->size) + 1; + /* size should never be larger than SQUASHFS_NAME_LEN */ + if (size > SQUASHFS_NAME_LEN) + goto failed_read; + err = squashfs_read_metadata(inode->i_sb, dire->name, &block, &offset, size); if (err < 0) diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c index 7a9464d08cf6..5d922a6701ab 100644 --- a/fs/squashfs/namei.c +++ b/fs/squashfs/namei.c @@ -176,6 +176,11 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, length += sizeof(dirh); dir_count = le32_to_cpu(dirh.count) + 1; + + /* dir_count should never be larger than 256 */ + if (dir_count > 256) + goto data_error; + while (dir_count--) { /* * Read directory entry. @@ -187,6 +192,10 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, size = le16_to_cpu(dire->size) + 1; + /* size should never be larger than SQUASHFS_NAME_LEN */ + if (size > SQUASHFS_NAME_LEN) + goto data_error; + err = squashfs_read_metadata(dir->i_sb, dire->name, &block, &offset, size); if (err < 0) @@ -228,6 +237,9 @@ exit_lookup: d_add(dentry, inode); return ERR_PTR(0); +data_error: + err = -EIO; + read_failure: ERROR("Unable to read directory block [%llx:%x]\n", squashfs_i(dir)->start + msblk->directory_table, diff --git a/fs/super.c b/fs/super.c index 938119ab8dcb..c7765bd38ee2 100644 --- a/fs/super.c +++ b/fs/super.c @@ -305,8 +305,13 @@ retry: if (s) { up_write(&s->s_umount); destroy_super(s); + s = NULL; } down_write(&old->s_umount); + if (unlikely(!(old->s_flags & MS_BORN))) { + deactivate_locked_super(old); + goto retry; + } return old; } } @@ -909,6 +914,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void goto out_free_secdata; BUG_ON(!mnt->mnt_sb); WARN_ON(!mnt->mnt_sb->s_bdi); + mnt->mnt_sb->s_flags |= MS_BORN; error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); if (error) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1beaa739d0a6..cd796847c813 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) char *p; p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); - if (p) + if (!IS_ERR(p)) memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 37fa7ed062d8..de01f28cf44c 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -519,7 +519,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) size_t sz; if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX)) - goto out; + return 0; INIT_LIST_HEAD(&list); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index c2a68baa782f..a0c09fc49962 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -961,11 +961,39 @@ void dbg_dump_index(struct ubifs_info *c) void dbg_save_space_info(struct ubifs_info *c) { struct ubifs_debug_info *d = c->dbg; - - ubifs_get_lp_stats(c, &d->saved_lst); + int freeable_cnt; spin_lock(&c->space_lock); + memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats)); + + /* + * We use a dirty hack here and zero out @c->freeable_cnt, because it + * affects the free space calculations, and UBIFS might not know about + * all freeable eraseblocks. Indeed, we know about freeable eraseblocks + * only when we read their lprops, and we do this only lazily, upon the + * need. So at any given point of time @c->freeable_cnt might be not + * exactly accurate. + * + * Just one example about the issue we hit when we did not zero + * @c->freeable_cnt. + * 1. The file-system is mounted R/O, c->freeable_cnt is %0. We save the + * amount of free space in @d->saved_free + * 2. We re-mount R/W, which makes UBIFS to read the "lsave" + * information from flash, where we cache LEBs from various + * categories ('ubifs_remount_fs()' -> 'ubifs_lpt_init()' + * -> 'lpt_init_wr()' -> 'read_lsave()' -> 'ubifs_lpt_lookup()' + * -> 'ubifs_get_pnode()' -> 'update_cats()' + * -> 'ubifs_add_to_cat()'). + * 3. Lsave contains a freeable eraseblock, and @c->freeable_cnt + * becomes %1. + * 4. We calculate the amount of free space when the re-mount is + * finished in 'dbg_check_space_info()' and it does not match + * @d->saved_free. + */ + freeable_cnt = c->freeable_cnt; + c->freeable_cnt = 0; d->saved_free = ubifs_get_free_space_nolock(c); + c->freeable_cnt = freeable_cnt; spin_unlock(&c->space_lock); } @@ -982,12 +1010,15 @@ int dbg_check_space_info(struct ubifs_info *c) { struct ubifs_debug_info *d = c->dbg; struct ubifs_lp_stats lst; - long long avail, free; + long long free; + int freeable_cnt; spin_lock(&c->space_lock); - avail = ubifs_calc_available(c, c->min_idx_lebs); + freeable_cnt = c->freeable_cnt; + c->freeable_cnt = 0; + free = ubifs_get_free_space_nolock(c); + c->freeable_cnt = freeable_cnt; spin_unlock(&c->space_lock); - free = ubifs_get_free_space(c); if (free != d->saved_free) { ubifs_err("free space changed from %lld to %lld", @@ -2656,19 +2687,19 @@ int dbg_debugfs_init_fs(struct ubifs_info *c) } fname = "dump_lprops"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_lprops = dent; fname = "dump_budg"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_budg = dent; fname = "dump_tnc"; - dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); + dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops); if (IS_ERR(dent)) goto out_remove; d->dfs_dump_tnc = dent; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 12f445cee9f7..47f55ea9ddad 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1315,6 +1315,9 @@ int ubifs_fsync(struct file *file, int datasync) dbg_gen("syncing inode %lu", inode->i_ino); + if (inode->i_sb->s_flags & MS_RDONLY) + return 0; + /* * VFS has already synchronized dirty pages for this inode. Synchronize * the inode unless this is a 'datasync()' call. diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index d321baeca68d..841f77cc93cf 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -665,6 +665,7 @@ out_free: out_release: release_head(c, BASEHD); + kfree(dent); out_ro: ubifs_ro_mode(c, err); if (last_reference) diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index ad7f67b827ea..ead230e08785 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -1270,10 +1270,9 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) lnum = branch->lnum; offs = branch->offs; pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS); - if (!pnode) { - err = -ENOMEM; - goto out; - } + if (!pnode) + return -ENOMEM; + if (lnum == 0) { /* * This pnode was not written which just means that the LEB diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 109c6ea03bb5..b712ed6fb6c4 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -300,6 +300,32 @@ int ubifs_recover_master_node(struct ubifs_info *c) goto out_free; } memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ); + + /* + * We had to recover the master node, which means there was an + * unclean reboot. However, it is possible that the master node + * is clean at this point, i.e., %UBIFS_MST_DIRTY is not set. + * E.g., consider the following chain of events: + * + * 1. UBIFS was cleanly unmounted, so the master node is clean + * 2. UBIFS is being mounted R/W and starts changing the master + * node in the first (%UBIFS_MST_LNUM). A power cut happens, + * so this LEB ends up with some amount of garbage at the + * end. + * 3. UBIFS is being mounted R/O. We reach this place and + * recover the master node from the second LEB + * (%UBIFS_MST_LNUM + 1). But we cannot update the media + * because we are being mounted R/O. We have to defer the + * operation. + * 4. However, this master node (@c->mst_node) is marked as + * clean (since the step 1). And if we just return, the + * mount code will be confused and won't recover the master + * node when it is re-mounter R/W later. + * + * Thus, to force the recovery by marking the master node as + * dirty. + */ + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); } else { /* Write the recovered master node */ c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1; diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 96cb62c8a9dd..f75f89b7867a 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -475,7 +475,8 @@ failed: * @c: UBIFS file-system description object * * This function returns a pointer to the superblock node or a negative error - * code. + * code. Note, the user of this function is responsible of kfree()'ing the + * returned superblock buffer. */ struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) { diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index 0b201114a5ad..5306c9195ed8 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -283,7 +283,11 @@ int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask) long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); if (nr == 0) - return clean_zn_cnt; + /* + * Due to the way UBIFS updates the clean znode counter it may + * temporarily be negative. + */ + return clean_zn_cnt >= 0 ? clean_zn_cnt : 1; if (!clean_zn_cnt) { /* diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 4d2f2157dd3f..f4d0c3686d23 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1545,6 +1545,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) } sup->leb_cnt = cpu_to_le32(c->leb_cnt); err = ubifs_write_sb_node(c, sup); + kfree(sup); if (err) goto out; } diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 34640d6dbdcb..f9f5567bf896 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -275,8 +275,11 @@ xfs_end_io( xfs_finish_ioend(ioend, 0); /* ensure we don't spin on blocked ioends */ delay(1); - } else + } else { + if (ioend->io_iocb) + aio_complete(ioend->io_iocb, ioend->io_result, 0); xfs_destroy_ioend(ioend); + } } /* @@ -309,6 +312,8 @@ xfs_alloc_ioend( atomic_inc(&XFS_I(ioend->io_inode)->i_iocount); ioend->io_offset = 0; ioend->io_size = 0; + ioend->io_iocb = NULL; + ioend->io_result = 0; INIT_WORK(&ioend->io_work, xfs_end_io); return ioend; @@ -1599,9 +1604,12 @@ xfs_end_io_direct( struct kiocb *iocb, loff_t offset, ssize_t size, - void *private) + void *private, + int ret, + bool is_async) { xfs_ioend_t *ioend = iocb->private; + bool complete_aio = is_async; /* * Non-NULL private data means we need to issue a transaction to @@ -1627,7 +1635,14 @@ xfs_end_io_direct( if (ioend->io_type == IO_READ) { xfs_finish_ioend(ioend, 0); } else if (private && size > 0) { - xfs_finish_ioend(ioend, is_sync_kiocb(iocb)); + if (is_async) { + ioend->io_iocb = iocb; + ioend->io_result = ret; + complete_aio = false; + xfs_finish_ioend(ioend, 0); + } else { + xfs_finish_ioend(ioend, 1); + } } else { /* * A direct I/O write ioend starts it's life in unwritten @@ -1645,6 +1660,9 @@ xfs_end_io_direct( * against double-freeing. */ iocb->private = NULL; + + if (complete_aio) + aio_complete(iocb, ret, 0); } STATIC ssize_t diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 4cfc6ea87df8..9f566d92ae3a 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -37,6 +37,8 @@ typedef struct xfs_ioend { size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ + struct kiocb *io_iocb; + int io_result; } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index e59a81062830..0a3eb18fc70c 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -703,14 +703,19 @@ xfs_ioc_fsgeometry_v1( xfs_mount_t *mp, void __user *arg) { - xfs_fsop_geom_v1_t fsgeo; + xfs_fsop_geom_t fsgeo; int error; - error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); + error = xfs_fs_geometry(mp, &fsgeo, 3); if (error) return -error; - if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + /* + * Caller should have passed an argument of type + * xfs_fsop_geom_v1_t. This is a proper subset of the + * xfs_fsop_geom_t that xfs_fs_geometry() fills in. + */ + if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) return -XFS_ERROR(EFAULT); return 0; } @@ -794,6 +799,8 @@ xfs_ioc_fsgetxattr( { struct fsxattr fa; + memset(&fa, 0, sizeof(struct fsxattr)); + xfs_ilock(ip, XFS_ILOCK_SHARED); fa.fsx_xflags = xfs_ip2xflags(ip); fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index a51a07c3a70c..f38037d83846 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -711,13 +711,10 @@ xfs_inode_set_reclaim_tag( } void -__xfs_inode_clear_reclaim_tag( - xfs_mount_t *mp, +__xfs_inode_clear_reclaim( xfs_perag_t *pag, xfs_inode_t *ip) { - radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); pag->pag_ici_reclaimable--; if (!pag->pag_ici_reclaimable) { /* clear the reclaim tag from the perag radix tree */ @@ -731,6 +728,17 @@ __xfs_inode_clear_reclaim_tag( } } +void +__xfs_inode_clear_reclaim_tag( + xfs_mount_t *mp, + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_clear_reclaim(pag, ip); +} + /* * Inodes in different states need to be treated differently, and the return * value of xfs_iflush is not sufficient to get this right. The following table diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index e28139aaa4aa..e4f7e1f2ab0f 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -47,6 +47,7 @@ int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); +void __xfs_inode_clear_reclaim(struct xfs_perag *pag, struct xfs_inode *ip); void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, struct xfs_inode *ip); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 37a6f62c57b6..4e7f02b22d1d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -57,6 +57,9 @@ xfs_fs_geometry( xfs_fsop_geom_t *geo, int new_version) { + + memset(geo, 0, sizeof(*geo)); + geo->blocksize = mp->m_sb.sb_blocksize; geo->rtextsize = mp->m_sb.sb_rextsize; geo->agblocks = mp->m_sb.sb_agblocks; diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index c7142a064c48..eb779affaaaa 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1217,7 +1217,6 @@ xfs_imap_lookup( struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; - xfs_agino_t startino; int error; int i; @@ -1231,13 +1230,13 @@ xfs_imap_lookup( } /* - * derive and lookup the exact inode record for the given agino. If the - * record cannot be found, then it's an invalid inode number and we - * should abort. + * Lookup the inode record for the given agino. If the record cannot be + * found, then it's an invalid inode number and we should abort. Once + * we have a record, we need to ensure it contains the inode number + * we are looking up. */ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); - startino = agino & ~(XFS_IALLOC_INODES(mp) - 1); - error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) error = xfs_inobt_get_rec(cur, &rec, &i); @@ -1250,6 +1249,11 @@ xfs_imap_lookup( if (error) return error; + /* check that the returned record contains the required inode */ + if (rec.ir_startino > agino || + rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) + return EINVAL; + /* for untrusted inodes check it is allocated first */ if ((flags & XFS_IGET_UNTRUSTED) && (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 8f8b91be2c99..a8bb8c2d48cb 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -492,6 +492,7 @@ xfs_ireclaim( write_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, agino)) ASSERT(0); + __xfs_inode_clear_reclaim(pag, ip); write_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b76a829d7e20..f70221820a12 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1927,6 +1927,11 @@ xfs_iunlink_remove( return 0; } +/* + * A big issue when freeing the inode cluster is is that we _cannot_ skip any + * inodes that are in memory - they all must be marked stale and attached to + * the cluster buffer. + */ STATIC void xfs_ifree_cluster( xfs_inode_t *free_ip, @@ -1958,8 +1963,6 @@ xfs_ifree_cluster( } for (j = 0; j < nbufs; j++, inum += ninodes) { - int found = 0; - blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -1978,7 +1981,9 @@ xfs_ifree_cluster( /* * Walk the inodes already attached to the buffer and mark them * stale. These will all have the flush locks held, so an - * in-memory inode walk can't lock them. + * in-memory inode walk can't lock them. By marking them all + * stale first, we will not attempt to lock them in the loop + * below as the XFS_ISTALE flag will be set. */ lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); while (lip) { @@ -1990,11 +1995,11 @@ xfs_ifree_cluster( &iip->ili_flush_lsn, &iip->ili_item.li_lsn); xfs_iflags_set(iip->ili_inode, XFS_ISTALE); - found++; } lip = lip->li_bio_list; } + /* * For each inode in memory attempt to add it to the inode * buffer and set it up for being staled on buffer IO @@ -2006,6 +2011,7 @@ xfs_ifree_cluster( * even trying to lock them. */ for (i = 0; i < ninodes; i++) { +retry: read_lock(&pag->pag_ici_lock); ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, (inum + i))); @@ -2016,38 +2022,36 @@ xfs_ifree_cluster( continue; } - /* don't try to lock/unlock the current inode */ + /* + * Don't try to lock/unlock the current inode, but we + * _cannot_ skip the other inodes that we did not find + * in the list attached to the buffer and are not + * already marked stale. If we can't lock it, back off + * and retry. + */ if (ip != free_ip && !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { read_unlock(&pag->pag_ici_lock); - continue; + delay(1); + goto retry; } read_unlock(&pag->pag_ici_lock); - if (!xfs_iflock_nowait(ip)) { - if (ip != free_ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - + xfs_iflock(ip); xfs_iflags_set(ip, XFS_ISTALE); - if (xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } + /* + * we don't need to attach clean inodes or those only + * with unlogged changes (which we throw away, anyway). + */ iip = ip->i_itemp; - if (!iip) { - /* inode with unlogged changes only */ + if (!iip || xfs_inode_clean(ip)) { ASSERT(ip != free_ip); ip->i_update_core = 0; xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); continue; } - found++; iip->ili_last_fields = iip->ili_format.ilf_fields; iip->ili_format.ilf_fields = 0; @@ -2063,8 +2067,7 @@ xfs_ifree_cluster( xfs_iunlock(ip, XFS_ILOCK_EXCL); } - if (found) - xfs_trans_stale_inode_buf(tp, bp); + xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); } |