diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-08-01 10:26:23 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-08-01 10:26:23 -0700 |
commit | a0e881b7c189fa2bd76c024dbff91e79511c971d (patch) | |
tree | 0c801918565b08921d21aceee5b326f64d998f5f | |
parent | eff0d13f3823f35d70228cd151d2a2c89288ff32 (diff) | |
parent | dbc6e0222d79e78925fe20733844a796a4b72cf9 (diff) |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull second vfs pile from Al Viro:
"The stuff in there: fsfreeze deadlock fixes by Jan (essentially, the
deadlock reproduced by xfstests 068), symlink and hardlink restriction
patches, plus assorted cleanups and fixes.
Note that another fsfreeze deadlock (emergency thaw one) is *not*
dealt with - the series by Fernando conflicts a lot with Jan's, breaks
userland ABI (FIFREEZE semantics gets changed) and trades the deadlock
for massive vfsmount leak; this is going to be handled next cycle.
There probably will be another pull request, but that stuff won't be
in it."
Fix up trivial conflicts due to unrelated changes next to each other in
drivers/{staging/gdm72xx/usb_boot.c, usb/gadget/storage_common.c}
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (54 commits)
delousing target_core_file a bit
Documentation: Correct s_umount state for freeze_fs/unfreeze_fs
fs: Remove old freezing mechanism
ext2: Implement freezing
btrfs: Convert to new freezing mechanism
nilfs2: Convert to new freezing mechanism
ntfs: Convert to new freezing mechanism
fuse: Convert to new freezing mechanism
gfs2: Convert to new freezing mechanism
ocfs2: Convert to new freezing mechanism
xfs: Convert to new freezing code
ext4: Convert to new freezing mechanism
fs: Protect write paths by sb_start_write - sb_end_write
fs: Skip atime update on frozen filesystem
fs: Add freezing handling to mnt_want_write() / mnt_drop_write()
fs: Improve filesystem freezing handling
switch the protection of percpu_counter list to spinlock
nfsd: Push mnt_want_write() outside of i_mutex
btrfs: Push mnt_want_write() outside of i_mutex
fat: Push mnt_want_write() outside of i_mutex
...
84 files changed, 1326 insertions, 639 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 7f647e17830c..0f103e39b4f6 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -138,8 +138,8 @@ evict_inode: put_super: write write_super: read sync_fs: read -freeze_fs: read -unfreeze_fs: read +freeze_fs: write +unfreeze_fs: write statfs: maybe(read) (see below) remount_fs: write umount_begin: no diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 8c235b6e4246..88152f214f48 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/fs: - nr_open - overflowuid - overflowgid +- protected_hardlinks +- protected_symlinks - suid_dumpable - super-max - super-nr @@ -157,6 +159,46 @@ The default is 65534. ============================================================== +protected_hardlinks: + +A long-standing class of security issues is the hardlink-based +time-of-check-time-of-use race, most commonly seen in world-writable +directories like /tmp. The common method of exploitation of this flaw +is to cross privilege boundaries when following a given hardlink (i.e. a +root process follows a hardlink created by another user). Additionally, +on systems without separated partitions, this stops unauthorized users +from "pinning" vulnerable setuid/setgid files against being upgraded by +the administrator, or linking to special files. + +When set to "0", hardlink creation behavior is unrestricted. + +When set to "1" hardlinks cannot be created by users if they do not +already own the source file, or do not have read/write access to it. + +This protection is based on the restrictions in Openwall and grsecurity. + +============================================================== + +protected_symlinks: + +A long-standing class of security issues is the symlink-based +time-of-check-time-of-use race, most commonly seen in world-writable +directories like /tmp. The common method of exploitation of this flaw +is to cross privilege boundaries when following a given symlink (i.e. a +root process follows a symlink belonging to another user). For a likely +incomplete list of hundreds of examples across the years, please see: +http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp + +When set to "0", symlink following behavior is unrestricted. + +When set to "1" symlinks are permitted to be followed only when outside +a sticky world-writable directory, or when the uid of the symlink and +follower match, or when the directory owner matches the symlink's owner. + +This protection is based on the restrictions in Openwall and grsecurity. + +============================================================== + suid_dumpable: This value can be used to query and set the core dump mode for setuid diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index d544d7816df3..dba1ce235da5 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -186,10 +186,13 @@ static void spufs_prune_dir(struct dentry *dir) static int spufs_rmdir(struct inode *parent, struct dentry *dir) { /* remove all entries */ + int res; spufs_prune_dir(dir); d_drop(dir); - - return simple_rmdir(parent, dir); + res = simple_rmdir(parent, dir); + /* We have to give up the mm_struct */ + spu_forget(SPUFS_I(dir->d_inode)->i_ctx); + return res; } static int spufs_fill_dir(struct dentry *dir, @@ -245,9 +248,6 @@ static int spufs_dir_close(struct inode *inode, struct file *file) mutex_unlock(&parent->i_mutex); WARN_ON(ret); - /* We have to give up the mm_struct */ - spu_forget(ctx); - return dcache_dir_close(inode, file); } @@ -450,28 +450,24 @@ spufs_create_context(struct inode *inode, struct dentry *dentry, struct spu_context *neighbor; struct path path = {.mnt = mnt, .dentry = dentry}; - ret = -EPERM; if ((flags & SPU_CREATE_NOSCHED) && !capable(CAP_SYS_NICE)) - goto out_unlock; + return -EPERM; - ret = -EINVAL; if ((flags & (SPU_CREATE_NOSCHED | SPU_CREATE_ISOLATE)) == SPU_CREATE_ISOLATE) - goto out_unlock; + return -EINVAL; - ret = -ENODEV; if ((flags & SPU_CREATE_ISOLATE) && !isolated_loader) - goto out_unlock; + return -ENODEV; gang = NULL; neighbor = NULL; affinity = flags & (SPU_CREATE_AFFINITY_MEM | SPU_CREATE_AFFINITY_SPU); if (affinity) { gang = SPUFS_I(inode)->i_gang; - ret = -EINVAL; if (!gang) - goto out_unlock; + return -EINVAL; mutex_lock(&gang->aff_mutex); neighbor = spufs_assert_affinity(flags, gang, aff_filp); if (IS_ERR(neighbor)) { @@ -492,22 +488,12 @@ spufs_create_context(struct inode *inode, struct dentry *dentry, } ret = spufs_context_open(&path); - if (ret < 0) { + if (ret < 0) WARN_ON(spufs_rmdir(inode, dentry)); - if (affinity) - mutex_unlock(&gang->aff_mutex); - mutex_unlock(&inode->i_mutex); - spu_forget(SPUFS_I(dentry->d_inode)->i_ctx); - goto out; - } out_aff_unlock: if (affinity) mutex_unlock(&gang->aff_mutex); -out_unlock: - mutex_unlock(&inode->i_mutex); -out: - dput(dentry); return ret; } @@ -580,18 +566,13 @@ static int spufs_create_gang(struct inode *inode, int ret; ret = spufs_mkgang(inode, dentry, mode & S_IRWXUGO); - if (ret) - goto out; - - ret = spufs_gang_open(&path); - if (ret < 0) { - int err = simple_rmdir(inode, dentry); - WARN_ON(err); + if (!ret) { + ret = spufs_gang_open(&path); + if (ret < 0) { + int err = simple_rmdir(inode, dentry); + WARN_ON(err); + } } - -out: - mutex_unlock(&inode->i_mutex); - dput(dentry); return ret; } @@ -601,40 +582,32 @@ static struct file_system_type spufs_type; long spufs_create(struct path *path, struct dentry *dentry, unsigned int flags, umode_t mode, struct file *filp) { + struct inode *dir = path->dentry->d_inode; int ret; - ret = -EINVAL; /* check if we are on spufs */ if (path->dentry->d_sb->s_type != &spufs_type) - goto out; + return -EINVAL; /* don't accept undefined flags */ if (flags & (~SPU_CREATE_FLAG_ALL)) - goto out; + return -EINVAL; /* only threads can be underneath a gang */ - if (path->dentry != path->dentry->d_sb->s_root) { - if ((flags & SPU_CREATE_GANG) || - !SPUFS_I(path->dentry->d_inode)->i_gang) - goto out; - } + if (path->dentry != path->dentry->d_sb->s_root) + if ((flags & SPU_CREATE_GANG) || !SPUFS_I(dir)->i_gang) + return -EINVAL; mode &= ~current_umask(); if (flags & SPU_CREATE_GANG) - ret = spufs_create_gang(path->dentry->d_inode, - dentry, path->mnt, mode); + ret = spufs_create_gang(dir, dentry, path->mnt, mode); else - ret = spufs_create_context(path->dentry->d_inode, - dentry, path->mnt, flags, mode, + ret = spufs_create_context(dir, dentry, path->mnt, flags, mode, filp); if (ret >= 0) - fsnotify_mkdir(path->dentry->d_inode, dentry); - return ret; + fsnotify_mkdir(dir, dentry); -out: - mutex_unlock(&path->dentry->d_inode->i_mutex); - dput(dentry); return ret; } diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c index 5665dcc382c7..5b7d8ffbf890 100644 --- a/arch/powerpc/platforms/cell/spufs/syscalls.c +++ b/arch/powerpc/platforms/cell/spufs/syscalls.c @@ -70,7 +70,7 @@ static long do_spu_create(const char __user *pathname, unsigned int flags, ret = PTR_ERR(dentry); if (!IS_ERR(dentry)) { ret = spufs_create(&path, dentry, flags, mode, neighbor); - path_put(&path); + done_path_create(&path, dentry); } return ret; diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index d91a3a0b2325..deb4a456cf83 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -156,9 +156,7 @@ static int dev_mkdir(const char *name, umode_t mode) if (!err) /* mark as kernel-created inode */ dentry->d_inode->i_private = &thread; - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + done_path_create(&path, dentry); return err; } @@ -218,10 +216,7 @@ static int handle_create(const char *nodename, umode_t mode, struct device *dev) /* mark as kernel-created inode */ dentry->d_inode->i_private = &thread; } - dput(dentry); - - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + done_path_create(&path, dentry); return err; } diff --git a/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c b/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c index 57bf1d7ee80f..9ab24528f9b9 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/dhd_linux.c @@ -1188,7 +1188,7 @@ exit: kfree(buf); /* close file before return */ if (fp) - filp_close(fp, current->files); + filp_close(fp, NULL); /* restore previous address limit */ set_fs(old_fs); diff --git a/drivers/staging/bcm/Misc.c b/drivers/staging/bcm/Misc.c index 9a60d4cd2184..f545716c666d 100644 --- a/drivers/staging/bcm/Misc.c +++ b/drivers/staging/bcm/Misc.c @@ -157,12 +157,7 @@ static int create_worker_threads(struct bcm_mini_adapter *psAdapter) static struct file *open_firmware_file(struct bcm_mini_adapter *Adapter, const char *path) { - struct file *flp = NULL; - mm_segment_t oldfs; - oldfs = get_fs(); - set_fs(get_ds()); - flp = filp_open(path, O_RDONLY, S_IRWXU); - set_fs(oldfs); + struct file *flp = filp_open(path, O_RDONLY, S_IRWXU); if (IS_ERR(flp)) { pr_err(DRV_NAME "Unable To Open File %s, err %ld", path, PTR_ERR(flp)); flp = NULL; @@ -183,14 +178,12 @@ static int BcmFileDownload(struct bcm_mini_adapter *Adapter, const char *path, u { int errorno = 0; struct file *flp = NULL; - mm_segment_t oldfs; struct timeval tv = {0}; flp = open_firmware_file(Adapter, path); if (!flp) { - errorno = -ENOENT; BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Unable to Open %s\n", path); - goto exit_download; + return -ENOENT; } BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Opened file is = %s and length =0x%lx to be downloaded at =0x%x", path, (unsigned long)flp->f_dentry->d_inode->i_size, loc); do_gettimeofday(&tv); @@ -201,10 +194,7 @@ static int BcmFileDownload(struct bcm_mini_adapter *Adapter, const char *path, u errorno = -EIO; goto exit_download; } - oldfs = get_fs(); - set_fs(get_ds()); vfs_llseek(flp, 0, 0); - set_fs(oldfs); if (Adapter->bcm_file_readback_from_chip(Adapter->pvInterfaceAdapter, flp, loc)) { BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Failed to read back firmware!"); errorno = -EIO; @@ -212,12 +202,7 @@ static int BcmFileDownload(struct bcm_mini_adapter *Adapter, const char *path, u } exit_download: - oldfs = get_fs(); - set_fs(get_ds()); - if (flp && !(IS_ERR(flp))) - filp_close(flp, current->files); - set_fs(oldfs); - + filp_close(flp, NULL); return errorno; } @@ -1056,10 +1041,8 @@ OUT: static int bcm_parse_target_params(struct bcm_mini_adapter *Adapter) { struct file *flp = NULL; - mm_segment_t oldfs = {0}; char *buff; int len = 0; - loff_t pos = 0; buff = kmalloc(BUFFER_1K, GFP_KERNEL); if (!buff) @@ -1079,20 +1062,16 @@ static int bcm_parse_target_params(struct bcm_mini_adapter *Adapter) Adapter->pstargetparams = NULL; return -ENOENT; } - oldfs = get_fs(); - set_fs(get_ds()); - len = vfs_read(flp, (void __user __force *)buff, BUFFER_1K, &pos); - set_fs(oldfs); + len = kernel_read(flp, 0, buff, BUFFER_1K); + filp_close(flp, NULL); if (len != sizeof(STARGETPARAMS)) { BCM_DEBUG_PRINT(Adapter, DBG_TYPE_INITEXIT, MP_INIT, DBG_LVL_ALL, "Mismatch in Target Param Structure!\n"); kfree(buff); kfree(Adapter->pstargetparams); Adapter->pstargetparams = NULL; - filp_close(flp, current->files); return -ENOENT; } - filp_close(flp, current->files); /* Check for autolink in config params */ /* diff --git a/drivers/staging/gdm72xx/sdio_boot.c b/drivers/staging/gdm72xx/sdio_boot.c index 760efee23d4a..65624bca8b3a 100644 --- a/drivers/staging/gdm72xx/sdio_boot.c +++ b/drivers/staging/gdm72xx/sdio_boot.c @@ -66,9 +66,8 @@ static int download_image(struct sdio_func *func, char *img_name) return -ENOENT; } - if (filp->f_dentry) - inode = filp->f_dentry->d_inode; - if (!inode || !S_ISREG(inode->i_mode)) { + inode = filp->f_dentry->d_inode; + if (!S_ISREG(inode->i_mode)) { printk(KERN_ERR "Invalid file type: %s\n", img_name); ret = -EINVAL; goto out; @@ -123,7 +122,7 @@ static int download_image(struct sdio_func *func, char *img_name) pno++; } out: - filp_close(filp, current->files); + filp_close(filp, NULL); return ret; } diff --git a/drivers/staging/gdm72xx/usb_boot.c b/drivers/staging/gdm72xx/usb_boot.c index fef290c38db6..e3dbd5a552ca 100644 --- a/drivers/staging/gdm72xx/usb_boot.c +++ b/drivers/staging/gdm72xx/usb_boot.c @@ -173,14 +173,12 @@ int usb_boot(struct usb_device *usbdev, u16 pid) filp = filp_open(img_name, O_RDONLY | O_LARGEFILE, 0); if (IS_ERR(filp)) { printk(KERN_ERR "Can't find %s.\n", img_name); - set_fs(fs); ret = PTR_ERR(filp); goto restore_fs; } - if (filp->f_dentry) - inode = filp->f_dentry->d_inode; - if (!inode || !S_ISREG(inode->i_mode)) { + inode = filp->f_dentry->d_inode; + if (!S_ISREG(inode->i_mode)) { printk(KERN_ERR "Invalid file type: %s\n", img_name); ret = -EINVAL; goto out; @@ -262,7 +260,7 @@ int usb_boot(struct usb_device *usbdev, u16 pid) ret = -EINVAL; } out: - filp_close(filp, current->files); + filp_close(filp, NULL); restore_fs: set_fs(fs); @@ -322,13 +320,11 @@ static int em_download_image(struct usb_device *usbdev, char *path, goto restore_fs; } - if (filp->f_dentry) { - inode = filp->f_dentry->d_inode; - if (!inode || !S_ISREG(inode->i_mode)) { - printk(KERN_ERR "Invalid file type: %s\n", path); - ret = -EINVAL; - goto out; - } + inode = filp->f_dentry->d_inode; + if (!S_ISREG(inode->i_mode)) { + printk(KERN_ERR "Invalid file type: %s\n", path); + ret = -EINVAL; + goto out; } buf = kmalloc(DOWNLOAD_CHUCK + pad_size, GFP_KERNEL); @@ -364,7 +360,7 @@ static int em_download_image(struct usb_device *usbdev, char *path, goto out; out: - filp_close(filp, current->files); + filp_close(filp, NULL); restore_fs: set_fs(fs); diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 9e2100551c78..cbb5aaf3e567 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -109,46 +109,29 @@ static struct se_device *fd_create_virtdevice( struct se_subsystem_dev *se_dev, void *p) { - char *dev_p = NULL; struct se_device *dev; struct se_dev_limits dev_limits; struct queue_limits *limits; struct fd_dev *fd_dev = p; struct fd_host *fd_host = hba->hba_ptr; - mm_segment_t old_fs; struct file *file; struct inode *inode = NULL; int dev_flags = 0, flags, ret = -EINVAL; memset(&dev_limits, 0, sizeof(struct se_dev_limits)); - old_fs = get_fs(); - set_fs(get_ds()); - dev_p = getname(fd_dev->fd_dev_name); - set_fs(old_fs); - - if (IS_ERR(dev_p)) { - pr_err("getname(%s) failed: %lu\n", - fd_dev->fd_dev_name, IS_ERR(dev_p)); - ret = PTR_ERR(dev_p); - goto fail; - } /* * Use O_DSYNC by default instead of O_SYNC to forgo syncing * of pure timestamp updates. */ flags = O_RDWR | O_CREAT | O_LARGEFILE | O_DSYNC; - file = filp_open(dev_p, flags, 0600); + file = filp_open(fd_dev->fd_dev_name, flags, 0600); if (IS_ERR(file)) { - pr_err("filp_open(%s) failed\n", dev_p); + pr_err("filp_open(%s) failed\n", fd_dev->fd_dev_name); ret = PTR_ERR(file); goto fail; } - if (!file || !file->f_dentry) { - pr_err("filp_open(%s) failed\n", dev_p); - goto fail; - } fd_dev->fd_file = file; /* * If using a block backend with this struct file, we extract @@ -212,14 +195,12 @@ static struct se_device *fd_create_virtdevice( " %llu total bytes\n", fd_host->fd_host_id, fd_dev->fd_dev_id, fd_dev->fd_dev_name, fd_dev->fd_dev_size); - putname(dev_p); return dev; fail: if (fd_dev->fd_file) { filp_close(fd_dev->fd_file, NULL); fd_dev->fd_file = NULL; } - putname(dev_p); return ERR_PTR(ret); } @@ -452,14 +433,11 @@ static ssize_t fd_set_configfs_dev_params( token = match_token(ptr, tokens, args); switch (token) { case Opt_fd_dev_name: - arg_p = match_strdup(&args[0]); - if (!arg_p) { - ret = -ENOMEM; + if (match_strlcpy(fd_dev->fd_dev_name, &args[0], + FD_MAX_DEV_NAME) == 0) { + ret = -EINVAL; break; } - snprintf(fd_dev->fd_dev_name, FD_MAX_DEV_NAME, - "%s", arg_p); - kfree(arg_p); pr_debug("FILEIO: Referencing Path: %s\n", fd_dev->fd_dev_name); fd_dev->fbd_flags |= FBDF_HAS_PATH; diff --git a/drivers/usb/gadget/storage_common.c b/drivers/usb/gadget/storage_common.c index ae8b18869b8c..8d9bcd8207c8 100644 --- a/drivers/usb/gadget/storage_common.c +++ b/drivers/usb/gadget/storage_common.c @@ -656,9 +656,8 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename) if (!(filp->f_mode & FMODE_WRITE)) ro = 1; - if (filp->f_path.dentry) - inode = filp->f_path.dentry->d_inode; - if (!inode || (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) { + inode = filp->f_path.dentry->d_inode; + if ((!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) { LINFO(curlun, "invalid file type: %s\n", filename); goto out; } @@ -667,7 +666,7 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename) * If we can't read the file, it's no good. * If we can't write the file, use it read-only. */ - if (!filp->f_op || !(filp->f_op->read || filp->f_op->aio_read)) { + if (!(filp->f_op->read || filp->f_op->aio_read)) { LINFO(curlun, "file not readable: %s\n", filename); goto out; } @@ -712,7 +711,6 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename) if (fsg_lun_is_open(curlun)) fsg_lun_close(curlun); - get_file(filp); curlun->blksize = blksize; curlun->blkbits = blkbits; curlun->ro = ro; @@ -720,10 +718,10 @@ static int fsg_lun_open(struct fsg_lun *curlun, const char *filename) curlun->file_length = size; curlun->num_sectors = num_sectors; LDBG(curlun, "open backing file: %s\n", filename); - rc = 0; + return 0; out: - filp_close(filp, current->files); + fput(filp); return rc; } diff --git a/drivers/usb/gadget/u_uac1.c b/drivers/usb/gadget/u_uac1.c index af9898982059..e0c5e88e03ed 100644 --- a/drivers/usb/gadget/u_uac1.c +++ b/drivers/usb/gadget/u_uac1.c @@ -275,17 +275,17 @@ static int gaudio_close_snd_dev(struct gaudio *gau) /* Close control device */ snd = &gau->control; if (snd->filp) - filp_close(snd->filp, current->files); + filp_close(snd->filp, NULL); /* Close PCM playback device and setup substream */ snd = &gau->playback; if (snd->filp) - filp_close(snd->filp, current->files); + filp_close(snd->filp, NULL); /* Close PCM capture device and setup substream */ snd = &gau->capture; if (snd->filp) - filp_close(snd->filp, current->files); + filp_close(snd->filp, NULL); return 0; } diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c index 1ddeb11659d4..64cda560c488 100644 --- a/drivers/video/fb_defio.c +++ b/drivers/video/fb_defio.c @@ -104,6 +104,8 @@ static int fb_deferred_io_mkwrite(struct vm_area_struct *vma, deferred framebuffer IO. then if userspace touches a page again, we repeat the same scheme */ + file_update_time(vma->vm_file); + /* protect against the workqueue changing the page list */ mutex_lock(&fbdefio->lock); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index fc06fd27065e..dd6f7ee1e312 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -610,6 +610,9 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", page, (unsigned long)filp->private_data); + /* Update file times before taking page lock */ + file_update_time(filp); + v9inode = V9FS_I(inode); /* make sure the cache has finished storing the page */ v9fs_fscache_wait_on_page_write(inode, page); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fadeba6a5db9..62e0cafd6e25 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1614,8 +1614,6 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; do { - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); - if (!(root->fs_info->sb->s_flags & MS_RDONLY) && mutex_trylock(&root->fs_info->cleaner_mutex)) { btrfs_run_delayed_iputs(root); @@ -1647,7 +1645,6 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; delay = HZ * 30; - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); spin_lock(&root->fs_info->trans_lock); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9aa01ec2138d..5caf285c6e4d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1379,7 +1379,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, ssize_t err = 0; size_t count, ocount; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); @@ -1469,6 +1469,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, num_written = err; } out: + sb_end_write(inode->i_sb); current->backing_dev_info = NULL; return num_written ? num_written : err; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 48bdfd2591c2..83baec24946d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6629,6 +6629,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; + sb_start_pagefault(inode->i_sb); ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (!ret) { ret = file_update_time(vma->vm_file); @@ -6718,12 +6719,15 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); out_unlock: - if (!ret) + if (!ret) { + sb_end_pagefault(inode->i_sb); return VM_FAULT_LOCKED; + } unlock_page(page); out: btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); out_noreserve: + sb_end_pagefault(inode->i_sb); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 43f0012016e3..bc2f6ffff3cf 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -195,6 +195,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(file); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); ip_oldflags = ip->flags; @@ -209,10 +213,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } - ret = mnt_want_write_file(file); - if (ret) - goto out_unlock; - if (flags & FS_SYNC_FL) ip->flags |= BTRFS_INODE_SYNC; else @@ -275,9 +275,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) inode->i_flags = i_oldflags; } - mnt_drop_write_file(file); out_unlock: mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(file); return ret; } @@ -664,6 +664,10 @@ static noinline int btrfs_mksubvol(struct path *parent, struct dentry *dentry; int error; + error = mnt_want_write(parent->mnt); + if (error) + return error; + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); dentry = lookup_one_len(name, parent->dentry, namelen); @@ -699,6 +703,7 @@ out_dput: dput(dentry); out_unlock: mutex_unlock(&dir->i_mutex); + mnt_drop_write(parent->mnt); return error; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7ac7cdcc294e..17be3dedacba 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -335,6 +335,8 @@ again: if (!h) return ERR_PTR(-ENOMEM); + sb_start_intwrite(root->fs_info->sb); + if (may_wait_transaction(root, type)) wait_current_trans(root); @@ -345,6 +347,7 @@ again: } while (ret == -EBUSY); if (ret < 0) { + sb_end_intwrite(root->fs_info->sb); kmem_cache_free(btrfs_trans_handle_cachep, h); return ERR_PTR(ret); } @@ -548,6 +551,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; + sb_end_intwrite(root->fs_info->sb); + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root)) { trans->transaction->blocked = 1; @@ -1578,6 +1583,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); + sb_end_intwrite(root->fs_info->sb); + trace_btrfs_transaction_commit(root); btrfs_scrub_continue(root); diff --git a/fs/buffer.c b/fs/buffer.c index c7062c896d7c..9f6d2e41281d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2306,8 +2306,8 @@ EXPORT_SYMBOL(block_commit_write); * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. * - * Direct callers of this function should call vfs_check_frozen() so that page - * fault does not busyloop until the fs is thawed. + * Direct callers of this function should protect against filesystem freezing + * using sb_start_write() - sb_end_write() functions. */ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) @@ -2318,6 +2318,12 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, loff_t size; int ret; + /* + * Update file times before taking page lock. We may end up failing the + * fault so this update may be superfluous but who really cares... + */ + file_update_time(vma->vm_file); + lock_page(page); size = i_size_read(inode); if ((page->mapping != inode->i_mapping) || @@ -2339,18 +2345,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (unlikely(ret < 0)) goto out_unlock; - /* - * Freezing in progress? We check after the page is marked dirty and - * with page lock held so if the test here fails, we are sure freezing - * code will wait during syncing until the page fault is done - at that - * point page will be dirty and unlocked so freezing code will write it - * and writeprotect it again. - */ set_page_dirty(page); - if (inode->i_sb->s_frozen != SB_UNFROZEN) { - ret = -EAGAIN; - goto out_unlock; - } wait_on_page_writeback(page); return 0; out_unlock: @@ -2365,12 +2360,9 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, int ret; struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb; - /* - * This check is racy but catches the common case. The check in - * __block_page_mkwrite() is reliable. - */ - vfs_check_frozen(sb, SB_FREEZE_WRITE); + sb_start_pagefault(sb); ret = __block_page_mkwrite(vma, vmf, get_block); + sb_end_pagefault(sb); return block_page_mkwrite_return(ret); } EXPORT_SYMBOL(block_page_mkwrite); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 8b67304e4b80..452e71a1b753 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1184,6 +1184,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size, len; int ret; + /* Update time before taking page lock */ + file_update_time(vma->vm_file); + size = i_size_read(inode); if (off + PAGE_CACHE_SIZE <= size) len = PAGE_CACHE_SIZE; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index ffa2be57804d..c3ca12c33ca2 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -318,21 +318,20 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, struct vfsmount *lower_mnt; int rc = 0; - lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); - fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); - BUG_ON(!lower_dentry->d_count); - dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); - ecryptfs_set_dentry_private(dentry, dentry_info); if (!dentry_info) { printk(KERN_ERR "%s: Out of memory whilst attempting " "to allocate ecryptfs_dentry_info struct\n", __func__); dput(lower_dentry); - mntput(lower_mnt); - d_drop(dentry); return -ENOMEM; } + + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); + fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); + BUG_ON(!lower_dentry->d_count); + + ecryptfs_set_dentry_private(dentry, dentry_info); ecryptfs_set_dentry_lower(dentry, lower_dentry); ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); @@ -381,12 +380,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, struct dentry *lower_dir_dentry, *lower_dentry; int rc = 0; - if ((ecryptfs_dentry->d_name.len == 1 - && !strcmp(ecryptfs_dentry->d_name.name, ".")) - || (ecryptfs_dentry->d_name.len == 2 - && !strcmp(ecryptfs_dentry->d_name.name, ".."))) { - goto out_d_drop; - } lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); mutex_lock(&lower_dir_dentry->d_inode->i_mutex); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, @@ -397,8 +390,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " "[%d] on lower_dentry = [%s]\n", __func__, rc, - encrypted_and_encoded_name); - goto out_d_drop; + ecryptfs_dentry->d_name.name); + goto out; } if (lower_dentry->d_inode) goto interpose; @@ -415,7 +408,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt and encode " "filename; rc = [%d]\n", __func__, rc); - goto out_d_drop; + goto out; } mutex_lock(&lower_dir_dentry->d_inode->i_mutex); lower_dentry = lookup_one_len(encrypted_and_encoded_name, @@ -427,14 +420,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " "[%d] on lower_dentry = [%s]\n", __func__, rc, encrypted_and_encoded_name); - goto out_d_drop; + goto out; } interpose: rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry, ecryptfs_dir_inode); - goto out; -out_d_drop: - d_drop(ecryptfs_dentry); out: kfree(encrypted_and_encoded_name); return ERR_PTR(rc); diff --git a/fs/exec.c b/fs/exec.c index 3684353ebd5f..574cf4de4ec3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2069,25 +2069,18 @@ static void wait_for_dump_helpers(struct file *file) */ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { - struct file *rp, *wp; + struct file *files[2]; struct fdtable *fdt; struct coredump_params *cp = (struct coredump_params *)info->data; struct files_struct *cf = current->files; + int err = create_pipe_files(files, 0); + if (err) + return err; - wp = create_write_pipe(0); - if (IS_ERR(wp)) - return PTR_ERR(wp); - - rp = create_read_pipe(wp, 0); - if (IS_ERR(rp)) { - free_write_pipe(wp); - return PTR_ERR(rp); - } - - cp->file = wp; + cp->file = files[1]; sys_close(0); - fd_install(0, rp); + fd_install(0, files[0]); spin_lock(&cf->file_lock); fdt = files_fdtable(cf); __set_open_fd(0, fdt); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 264d315f6c47..6363ac66fafa 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -79,6 +79,7 @@ void ext2_evict_inode(struct inode * inode) truncate_inode_pages(&inode->i_data, 0); if (want_delete) { + sb_start_intwrite(inode->i_sb); /* set dtime */ EXT2_I(inode)->i_dtime = get_seconds(); mark_inode_dirty(inode); @@ -98,8 +99,10 @@ void ext2_evict_inode(struct inode * inode) if (unlikely(rsv)) kfree(rsv); - if (want_delete) + if (want_delete) { ext2_free_inode(inode); + sb_end_intwrite(inode->i_sb); + } } typedef struct { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 9f311d27b16f..af74d9e27b71 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -42,6 +42,8 @@ static void ext2_sync_super(struct super_block *sb, static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext2_sync_fs(struct super_block *sb, int wait); +static int ext2_freeze(struct super_block *sb); +static int ext2_unfreeze(struct super_block *sb); void ext2_error(struct super_block *sb, const char *function, const char *fmt, ...) @@ -305,6 +307,8 @@ static const struct super_operations ext2_sops = { .evict_inode = ext2_evict_inode, .put_super = ext2_put_super, .sync_fs = ext2_sync_fs, + .freeze_fs = ext2_freeze, + .unfreeze_fs = ext2_unfreeze, .statfs = ext2_statfs, .remount_fs = ext2_remount, .show_options = ext2_show_options, @@ -1200,6 +1204,35 @@ static int ext2_sync_fs(struct super_block *sb, int wait) return 0; } +static int ext2_freeze(struct super_block *sb) +{ + struct ext2_sb_info *sbi = EXT2_SB(sb); + + /* + * Open but unlinked files present? Keep EXT2_VALID_FS flag cleared + * because we have unattached inodes and thus filesystem is not fully + * consistent. + */ + if (atomic_long_read(&sb->s_remove_count)) { + ext2_sync_fs(sb, 1); + return 0; + } + /* Set EXT2_FS_VALID flag */ + spin_lock(&sbi->s_lock); + sbi->s_es->s_state = cpu_to_le16(sbi->s_mount_state); + spin_unlock(&sbi->s_lock); + ext2_sync_super(sb, sbi->s_es, 1); + + return 0; +} + +static int ext2_unfreeze(struct super_block *sb) +{ + /* Just write sb to clear EXT2_VALID_FS flag */ + ext2_write_super(sb); + + return 0; +} void ext2_write_super(struct super_block *sb) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 89b59cb7f9b8..6324f74e0342 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -233,6 +233,11 @@ void ext4_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; + /* + * Protect us against freezing - iput() caller didn't have to have any + * protection against it + */ + sb_start_intwrite(inode->i_sb); handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); @@ -242,6 +247,7 @@ void ext4_evict_inode(struct inode *inode) * cleaned up. */ ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); goto no_delete; } @@ -273,6 +279,7 @@ void ext4_evict_inode(struct inode *inode) stop_handle: ext4_journal_stop(handle); ext4_orphan_del(NULL, inode); + sb_end_intwrite(inode->i_sb); goto no_delete; } } @@ -301,6 +308,7 @@ void ext4_evict_inode(struct inode *inode) else ext4_free_inode(handle, inode); ext4_journal_stop(handle); + sb_end_intwrite(inode->i_sb); return; no_delete: ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ @@ -4779,11 +4787,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) get_block_t *get_block; int retries = 0; - /* - * This check is racy but catches the common case. We rely on - * __block_page_mkwrite() to do a reliable check. - */ - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_pagefault(inode->i_sb); /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && @@ -4851,5 +4855,6 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(ret); out: + sb_end_pagefault(inode->i_sb); return ret; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index f99a1311e847..fe7c63f4717e 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -44,6 +44,11 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) { struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data); + /* + * We protect against freezing so that we don't create dirty buffers + * on frozen filesystem. + */ + sb_start_write(sb); ext4_mmp_csum_set(sb, mmp); mark_buffer_dirty(bh); lock_buffer(bh); @@ -51,6 +56,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) get_bh(bh); submit_bh(WRITE_SYNC, bh); wait_on_buffer(bh); + sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) return 1; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2d51cd9af225..d76ec8277d3f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -331,33 +331,17 @@ static void ext4_put_nojournal(handle_t *handle) * journal_end calls result in the superblock being marked dirty, so * that sync() will call the filesystem's write_super callback if * appropriate. - * - * To avoid j_barrier hold in userspace when a user calls freeze(), - * ext4 prevents a new handle from being started by s_frozen, which - * is in an upper layer. */ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) { journal_t *journal; - handle_t *handle; trace_ext4_journal_start(sb, nblocks, _RET_IP_); if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; - handle = ext4_journal_current_handle(); - - /* - * If a handle has been started, it should be allowed to - * finish, otherwise deadlock could happen between freeze - * and others(e.g. truncate) due to the restart of the - * journal handle if the filesystem is forzen and active - * handles are not stopped. - */ - if (!handle) - vfs_check_frozen(sb, SB_FREEZE_TRANS); - if (!journal) return ext4_get_nojournal(); /* @@ -2747,6 +2731,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) sb = elr->lr_super; ngroups = EXT4_SB(sb)->s_groups_count; + sb_start_write(sb); for (group = elr->lr_next_group; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { @@ -2773,6 +2758,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr) elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } + sb_end_write(sb); return ret; } @@ -4460,10 +4446,8 @@ int ext4_force_commit(struct super_block *sb) return 0; journal = EXT4_SB(sb)->s_journal; - if (journal) { - vfs_check_frozen(sb, SB_FREEZE_TRANS); + if (journal) ret = ext4_journal_force_commit(journal); - } return ret; } @@ -4493,9 +4477,8 @@ static int ext4_sync_fs(struct super_block *sb, int wait) * gives us a chance to flush the journal completely and mark the fs clean. * * Note that only this function cannot bring a filesystem to be in a clean - * state independently, because ext4 prevents a new handle from being started - * by @sb->s_frozen, which stays in an upper layer. It thus needs help from - * the upper layer. + * state independently. It relies on upper layer to stop all data & metadata + * modifications. */ static int ext4_freeze(struct super_block *sb) { @@ -4522,7 +4505,7 @@ static int ext4_freeze(struct super_block *sb) EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); error = ext4_commit_super(sb, 1); out: - /* we rely on s_frozen to stop further updates */ + /* we rely on upper layer to stop further updates */ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); return error; } diff --git a/fs/fat/file.c b/fs/fat/file.c index a71fe3715ee8..e007b8bd8e5e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -43,10 +43,10 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) if (err) goto out; - mutex_lock(&inode->i_mutex); err = mnt_want_write_file(file); if (err) - goto out_unlock_inode; + goto out; + mutex_lock(&inode->i_mutex); /* * ATTR_VOLUME and ATTR_DIR cannot be changed; this also @@ -73,14 +73,14 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) /* The root directory has no attributes */ if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) { err = -EINVAL; - goto out_drop_write; + goto out_unlock_inode; } if (sbi->options.sys_immutable && ((attr | oldattr) & ATTR_SYS) && !capable(CAP_LINUX_IMMUTABLE)) { err = -EPERM; - goto out_drop_write; + goto out_unlock_inode; } /* @@ -90,12 +90,12 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) */ err = security_inode_setattr(file->f_path.dentry, &ia); if (err) - goto out_drop_write; + goto out_unlock_inode; /* This MUST be done before doing anything irreversible... */ err = fat_setattr(file->f_path.dentry, &ia); if (err) - goto out_drop_write; + goto out_unlock_inode; fsnotify_change(file->f_path.dentry, ia.ia_valid); if (sbi->options.sys_immutable) { @@ -107,10 +107,9 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) fat_save_attrs(inode, attr); mark_inode_dirty(inode); -out_drop_write: - mnt_drop_write_file(file); out_unlock_inode: mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(file); out: return err; } diff --git a/fs/file_table.c b/fs/file_table.c index b3fc4d67a26b..701985e4ccda 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -43,7 +43,7 @@ static struct kmem_cache *filp_cachep __read_mostly; static struct percpu_counter nr_files __cacheline_aligned_in_smp; -static inline void file_free_rcu(struct rcu_head *head) +static void file_free_rcu(struct rcu_head *head) { struct file *f = container_of(head, struct file, f_u.fu_rcuhead); @@ -217,7 +217,7 @@ static void drop_file_write_access(struct file *file) return; if (file_check_writeable(file) != 0) return; - mnt_drop_write(mnt); + __mnt_drop_write(mnt); file_release_write(file); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b321a688cde7..93d8d6c9494d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -944,9 +944,8 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return err; count = ocount; - + sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -1004,6 +1003,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, out: current->backing_dev_info = NULL; mutex_unlock(&inode->i_mutex); + sb_end_write(inode->i_sb); return written ? written : err; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 9aa6af13823c..d1d791ef38de 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -373,11 +373,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) loff_t size; int ret; - /* Wait if fs is frozen. This is racy so we check again later on - * and retry if the fs has been frozen after the page lock has - * been acquired - */ - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_pagefault(inode->i_sb); + + /* Update file times before taking page lock */ + file_update_time(vma->vm_file); ret = gfs2_rs_alloc(ip); if (ret) @@ -462,14 +461,9 @@ out: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); - /* This check must be post dropping of transaction lock */ - if (inode->i_sb->s_frozen == SB_UNFROZEN) { - wait_on_page_writeback(page); - } else { - ret = -EAGAIN; - unlock_page(page); - } + wait_on_page_writeback(page); } + sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index ad3e2fb763d7..adbd27875ef9 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -50,6 +50,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (revokes) tr->tr_reserved += gfs2_struct2blk(sdp, revokes, sizeof(u64)); + sb_start_intwrite(sdp->sd_vfs); gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh); error = gfs2_glock_nq(&tr->tr_t_gh); @@ -68,6 +69,7 @@ fail_gunlock: gfs2_glock_dq(&tr->tr_t_gh); fail_holder_uninit: + sb_end_intwrite(sdp->sd_vfs); gfs2_holder_uninit(&tr->tr_t_gh); kfree(tr); @@ -116,6 +118,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) gfs2_holder_uninit(&tr->tr_t_gh); kfree(tr); } + sb_end_intwrite(sdp->sd_vfs); return; } @@ -136,6 +139,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) gfs2_log_flush(sdp, NULL); + sb_end_intwrite(sdp->sd_vfs); } /** diff --git a/fs/inode.c b/fs/inode.c index 3cc504320467..ac8d904b3f16 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1542,9 +1542,11 @@ void touch_atime(struct path *path) if (timespec_equal(&inode->i_atime, &now)) return; - if (mnt_want_write(mnt)) + if (!sb_start_write_trylock(inode->i_sb)) return; + if (__mnt_want_write(mnt)) + goto skip_update; /* * File systems can error out when updating inodes if they need to * allocate new space to modify an inode (such is the case for @@ -1555,7 +1557,9 @@ void touch_atime(struct path *path) * of the fs read only, e.g. subvolumes in Btrfs. */ update_time(inode, &now, S_ATIME); - mnt_drop_write(mnt); + __mnt_drop_write(mnt); +skip_update: + sb_end_write(inode->i_sb); } EXPORT_SYMBOL(touch_atime); @@ -1662,11 +1666,11 @@ int file_update_time(struct file *file) return 0; /* Finally allowed to write? Takes lock. */ - if (mnt_want_write_file(file)) + if (__mnt_want_write_file(file)) return 0; ret = update_time(inode, &now, sync_it); - mnt_drop_write_file(file); + __mnt_drop_write_file(file); return ret; } diff --git a/fs/internal.h b/fs/internal.h index a6fd56c68b11..371bcc4b1697 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -61,6 +61,10 @@ extern void __init mnt_init(void); extern struct lglock vfsmount_lock; +extern int __mnt_want_write(struct vfsmount *); +extern int __mnt_want_write_file(struct file *); +extern void __mnt_drop_write(struct vfsmount *); +extern void __mnt_drop_write_file(struct file *); /* * fs_struct.c diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 8392cb85bd54..05d29124c6ab 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -156,12 +156,16 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) struct nlm_rqst *call; int status; - nlm_get_host(host); call = nlm_alloc_call(host); if (call == NULL) return -ENOMEM; nlmclnt_locks_init_private(fl, host); + if (!fl->fl_u.nfs_fl.owner) { + /* lockowner allocation has failed */ + nlmclnt_release_call(call); + return -ENOMEM; + } /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); @@ -185,9 +189,6 @@ EXPORT_SYMBOL_GPL(nlmclnt_proc); /* * Allocate an NLM RPC call struct - * - * Note: the caller must hold a reference to host. In case of failure, - * this reference will be released. */ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) { @@ -199,7 +200,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) atomic_set(&call->a_count, 1); locks_init_lock(&call->a_args.lock.fl); locks_init_lock(&call->a_res.lock.fl); - call->a_host = host; + call->a_host = nlm_get_host(host); return call; } if (signalled()) @@ -207,7 +208,6 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) printk("nlm_alloc_call: failed, waiting for memory\n"); schedule_timeout_interruptible(5*HZ); } - nlmclnt_release_host(host); return NULL; } @@ -750,7 +750,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl dprintk("lockd: blocking lock attempt was interrupted by a signal.\n" " Attempting to cancel lock.\n"); - req = nlm_alloc_call(nlm_get_host(host)); + req = nlm_alloc_call(host); if (!req) return -ENOMEM; req->a_flags = RPC_TASK_ASYNC; diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4a43d253c045..b147d1ae71fd 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -257,6 +257,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args return rpc_system_err; call = nlm_alloc_call(host); + nlmsvc_release_host(host); if (call == NULL) return rpc_system_err; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index afe4488c33d8..fb1a2bedbe97 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -219,7 +219,6 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, struct nlm_block *block; struct nlm_rqst *call = NULL; - nlm_get_host(host); call = nlm_alloc_call(host); if (call == NULL) return NULL; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index de8f2caa2235..3009a365e082 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -297,6 +297,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args return rpc_system_err; call = nlm_alloc_call(host); + nlmsvc_release_host(host); if (call == NULL) return rpc_system_err; diff --git a/fs/namei.c b/fs/namei.c index 2ccc35c4dc24..1b464390dde8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -650,6 +650,121 @@ static inline void put_link(struct nameidata *nd, struct path *link, void *cooki path_put(link); } +int sysctl_protected_symlinks __read_mostly = 1; +int sysctl_protected_hardlinks __read_mostly = 1; + +/** + * may_follow_link - Check symlink following for unsafe situations + * @link: The path of the symlink + * + * In the case of the sysctl_protected_symlinks sysctl being enabled, + * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is + * in a sticky world-writable directory. This is to protect privileged + * processes from failing races against path names that may change out + * from under them by way of other users creating malicious symlinks. + * It will permit symlinks to be followed only when outside a sticky + * world-writable directory, or when the uid of the symlink and follower + * match, or when the directory owner matches the symlink's owner. + * + * Returns 0 if following the symlink is allowed, -ve on error. + */ +static inline int may_follow_link(struct path *link, struct nameidata *nd) +{ + const struct inode *inode; + const struct inode *parent; + + if (!sysctl_protected_symlinks) + return 0; + + /* Allowed if owner and follower match. */ + inode = link->dentry->d_inode; + if (current_cred()->fsuid == inode->i_uid) + return 0; + + /* Allowed if parent directory not sticky and world-writable. */ + parent = nd->path.dentry->d_inode; + if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) + return 0; + + /* Allowed if parent directory and link owner match. */ + if (parent->i_uid == inode->i_uid) + return 0; + + path_put_conditional(link, nd); + path_put(&nd->path); + audit_log_link_denied("follow_link", link); + return -EACCES; +} + +/** + * safe_hardlink_source - Check for safe hardlink conditions + * @inode: the source inode to hardlink from + * + * Return false if at least one of the following conditions: + * - inode is not a regular file + * - inode is setuid + * - inode is setgid and group-exec + * - access failure for read and write + * + * Otherwise returns true. + */ +static bool safe_hardlink_source(struct inode *inode) +{ + umode_t mode = inode->i_mode; + + /* Special files should not get pinned to the filesystem. */ + if (!S_ISREG(mode)) + return false; + + /* Setuid files should not get pinned to the filesystem. */ + if (mode & S_ISUID) + return false; + + /* Executable setgid files should not get pinned to the filesystem. */ + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) + return false; + + /* Hardlinking to unreadable or unwritable sources is dangerous. */ + if (inode_permission(inode, MAY_READ | MAY_WRITE)) + return false; + + return true; +} + +/** + * may_linkat - Check permissions for creating a hardlink + * @link: the source to hardlink from + * + * Block hardlink when all of: + * - sysctl_protected_hardlinks enabled + * - fsuid does not match inode + * - hardlink source is unsafe (see safe_hardlink_source() above) + * - not CAP_FOWNER + * + * Returns 0 if successful, -ve on error. + */ +static int may_linkat(struct path *link) +{ + const struct cred *cred; + struct inode *inode; + + if (!sysctl_protected_hardlinks) + return 0; + + cred = current_cred(); + inode = link->dentry->d_inode; + + /* Source inode owner (or CAP_FOWNER) can hardlink all they like, + * otherwise, it must be a safe source. + */ + if (cred->fsuid == inode->i_uid || safe_hardlink_source(inode) || + capable(CAP_FOWNER)) + return 0; + + audit_log_link_denied("linkat", link); + return -EPERM; +} + static __always_inline int follow_link(struct path *link, struct nameidata *nd, void **p) { @@ -1818,6 +1933,9 @@ static int path_lookupat(int dfd, const char *name, while (err > 0) { void *cookie; struct path link = path; + err = may_follow_link(&link, nd); + if (unlikely(err)) + break; nd->flags |= LOOKUP_PARENT; err = follow_link(&link, nd, &cookie); if (err) @@ -2277,7 +2395,7 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode) static int atomic_open(struct nameidata *nd, struct dentry *dentry, struct path *path, struct file *file, const struct open_flags *op, - bool *want_write, bool need_lookup, + bool got_write, bool need_lookup, int *opened) { struct inode *dir = nd->path.dentry->d_inode; @@ -2300,7 +2418,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, if ((open_flag & O_CREAT) && !IS_POSIXACL(dir)) mode &= ~current_umask(); - if (open_flag & O_EXCL) { + if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) { open_flag &= ~O_TRUNC; *opened |= FILE_CREATED; } @@ -2314,12 +2432,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ - if ((open_flag & (O_CREAT | O_TRUNC)) || - (open_flag & O_ACCMODE) != O_RDONLY) { - error = mnt_want_write(nd->path.mnt); - if (!error) { - *want_write = true; - } else if (!(open_flag & O_CREAT)) { + if (((open_flag & (O_CREAT | O_TRUNC)) || + (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) { + if (!(open_flag & O_CREAT)) { /* * No O_CREATE -> atomicity not a requirement -> fall * back to lookup + open @@ -2327,11 +2442,11 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, goto no_open; } else if (open_flag & (O_EXCL | O_TRUNC)) { /* Fall back and fail with the right error */ - create_error = error; + create_error = -EROFS; goto no_open; } else { /* No side effects, safe to clear O_CREAT */ - create_error = error; + create_error = -EROFS; open_flag &= ~O_CREAT; } } @@ -2438,7 +2553,7 @@ looked_up: static int lookup_open(struct nameidata *nd, struct path *path, struct file *file, const struct open_flags *op, - bool *want_write, int *opened) + bool got_write, int *opened) { struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; @@ -2456,7 +2571,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, goto out_no_open; if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) { - return atomic_open(nd, dentry, path, file, op, want_write, + return atomic_open(nd, dentry, path, file, op, got_write, need_lookup, opened); } @@ -2480,10 +2595,10 @@ static int lookup_open(struct nameidata *nd, struct path *path, * a permanent write count is taken through * the 'struct file' in finish_open(). */ - error = mnt_want_write(nd->path.mnt); - if (error) + if (!got_write) { + error = -EROFS; goto out_dput; - *want_write = true; + } *opened |= FILE_CREATED; error = security_path_mknod(&nd->path, dentry, mode, 0); if (error) @@ -2513,7 +2628,7 @@ static int do_last(struct nameidata *nd, struct path *path, struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool will_truncate = (open_flag & O_TRUNC) != 0; - bool want_write = false; + bool got_write = false; int acc_mode = op->acc_mode; struct inode *inode; bool symlink_ok = false; @@ -2582,8 +2697,18 @@ static int do_last(struct nameidata *nd, struct path *path, } retry_lookup: + if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { + error = mnt_want_write(nd->path.mnt); + if (!error) + got_write = true; + /* + * do _not_ fail yet - we might not need that or fail with + * a different error; let lookup_open() decide; we'll be + * dropping this one anyway. + */ + } mutex_lock(&dir->d_inode->i_mutex); - error = lookup_open(nd, path, file, op, &want_write, opened); + error = lookup_open(nd, path, file, op, got_write, opened); mutex_unlock(&dir->d_inode->i_mutex); if (error <= 0) { @@ -2608,22 +2733,23 @@ retry_lookup: } /* - * It already exists. + * create/update audit record if it already exists. */ - audit_inode(pathname, path->dentry); + if (path->dentry->d_inode) + audit_inode(pathname, path->dentry); /* * If atomic_open() acquired write access it is dropped now due to * possible mount and symlink following (this might be optimized away if * necessary...) */ - if (want_write) { + if (got_write) { mnt_drop_write(nd->path.mnt); - want_write = false; + got_write = false; } error = -EEXIST; - if (open_flag & O_EXCL) + if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) goto exit_dput; error = follow_managed(path, nd->flags); @@ -2684,7 +2810,7 @@ finish_open: error = mnt_want_write(nd->path.mnt); if (error) goto out; - want_write = true; + got_write = true; } finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); @@ -2711,7 +2837,7 @@ opened: goto exit_fput; } out: - if (want_write) + if (got_write) mnt_drop_write(nd->path.mnt); path_put(&save_parent); terminate_walk(nd); @@ -2735,9 +2861,9 @@ stale_open: nd->inode = dir->d_inode; save_parent.mnt = NULL; save_parent.dentry = NULL; - if (want_write) { + if (got_write) { mnt_drop_write(nd->path.mnt); - want_write = false; + got_write = false; } retried = true; goto retry_lookup; @@ -2777,6 +2903,9 @@ static struct file *path_openat(int dfd, const char *pathname, error = -ELOOP; break; } + error = may_follow_link(&link, nd); + if (unlikely(error)) + break; nd->flags |= LOOKUP_PARENT; nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); error = follow_link(&link, nd, &cookie); @@ -2846,6 +2975,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path { struct dentry *dentry = ERR_PTR(-EEXIST); struct nameidata nd; + int err2; int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); if (error) return ERR_PTR(error); @@ -2859,16 +2989,19 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path nd.flags &= ~LOOKUP_PARENT; nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; + /* don't fail immediately if it's r/o, at least try to report other errors */ + err2 = mnt_want_write(nd.path.mnt); /* * Do the final lookup. */ mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); if (IS_ERR(dentry)) - goto fail; + goto unlock; + error = -EEXIST; if (dentry->d_inode) - goto eexist; + goto fail; /* * Special case - lookup gave negative, but... we had foo/bar/ * From the vfs_mknod() POV we just have a negative dentry - @@ -2876,23 +3009,37 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path * been asking for (non-existent) directory. -ENOENT for you. */ if (unlikely(!is_dir && nd.last.name[nd.last.len])) { - dput(dentry); - dentry = ERR_PTR(-ENOENT); + error = -ENOENT; + goto fail; + } + if (unlikely(err2)) { + error = err2; goto fail; } *path = nd.path; return dentry; -eexist: - dput(dentry); - dentry = ERR_PTR(-EEXIST); fail: + dput(dentry); + dentry = ERR_PTR(error); +unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + if (!err2) + mnt_drop_write(nd.path.mnt); out: path_put(&nd.path); return dentry; } EXPORT_SYMBOL(kern_path_create); +void done_path_create(struct path *path, struct dentry *dentry) +{ + dput(dentry); + mutex_unlock(&path->dentry->d_inode->i_mutex); + mnt_drop_write(path->mnt); + path_put(path); +} +EXPORT_SYMBOL(done_path_create); + struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) { char *tmp = getname(pathname); @@ -2956,8 +3103,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, struct path path; int error; - if (S_ISDIR(mode)) - return -EPERM; + error = may_mknod(mode); + if (error) + return error; dentry = user_path_create(dfd, filename, &path, 0); if (IS_ERR(dentry)) @@ -2965,15 +3113,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); - error = may_mknod(mode); - if (error) - goto out_dput; - error = mnt_want_write(path.mnt); - if (error) - goto out_dput; error = security_path_mknod(&path, dentry, mode, dev); if (error) - goto out_drop_write; + goto out; switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(path.dentry->d_inode,dentry,mode,true); @@ -2986,13 +3128,8 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); break; } -out_drop_write: - mnt_drop_write(path.mnt); -out_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); - +out: + done_path_create(&path, dentry); return error; } @@ -3038,19 +3175,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); - error = mnt_want_write(path.mnt); - if (error) - goto out_dput; error = security_path_mkdir(&path, dentry, mode); - if (error) - goto out_drop_write; - error = vfs_mkdir(path.dentry->d_inode, dentry, mode); -out_drop_write: - mnt_drop_write(path.mnt); -out_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + if (!error) + error = vfs_mkdir(path.dentry->d_inode, dentry, mode); + done_path_create(&path, dentry); return error; } @@ -3144,6 +3272,9 @@ static long do_rmdir(int dfd, const char __user *pathname) } nd.flags &= ~LOOKUP_PARENT; + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit1; mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); @@ -3154,19 +3285,15 @@ static long do_rmdir(int dfd, const char __user *pathname) error = -ENOENT; goto exit3; } - error = mnt_want_write(nd.path.mnt); - if (error) - goto exit3; error = security_path_rmdir(&nd.path, dentry); if (error) - goto exit4; + goto exit3; error = vfs_rmdir(nd.path.dentry->d_inode, dentry); -exit4: - mnt_drop_write(nd.path.mnt); exit3: dput(dentry); exit2: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + mnt_drop_write(nd.path.mnt); exit1: path_put(&nd.path); putname(name); @@ -3233,6 +3360,9 @@ static long do_unlinkat(int dfd, const char __user *pathname) goto exit1; nd.flags &= ~LOOKUP_PARENT; + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit1; mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); dentry = lookup_hash(&nd); @@ -3245,21 +3375,17 @@ static long do_unlinkat(int dfd, const char __user *pathname) if (!inode) goto slashes; ihold(inode); - error = mnt_want_write(nd.path.mnt); - if (error) - goto exit2; error = security_path_unlink(&nd.path, dentry); if (error) - goto exit3; + goto exit2; error = vfs_unlink(nd.path.dentry->d_inode, dentry); -exit3: - mnt_drop_write(nd.path.mnt); - exit2: +exit2: dput(dentry); } mutex_unlock(&nd.path.dentry->d_inode->i_mutex); if (inode) iput(inode); /* truncate the inode here */ + mnt_drop_write(nd.path.mnt); exit1: path_put(&nd.path); putname(name); @@ -3324,19 +3450,10 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, if (IS_ERR(dentry)) goto out_putname; - error = mnt_want_write(path.mnt); - if (error) - goto out_dput; error = security_path_symlink(&path, dentry, from); - if (error) - goto out_drop_write; - error = vfs_symlink(path.dentry->d_inode, dentry, from); -out_drop_write: - mnt_drop_write(path.mnt); -out_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); + if (!error) + error = vfs_symlink(path.dentry->d_inode, dentry, from); + done_path_create(&path, dentry); out_putname: putname(from); return error; @@ -3436,19 +3553,15 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - error = mnt_want_write(new_path.mnt); - if (error) + error = may_linkat(&old_path); + if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) - goto out_drop_write; + goto out_dput; error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); -out_drop_write: - mnt_drop_write(new_path.mnt); out_dput: - dput(new_dentry); - mutex_unlock(&new_path.dentry->d_inode->i_mutex); - path_put(&new_path); + done_path_create(&new_path, new_dentry); out: path_put(&old_path); @@ -3644,6 +3757,10 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, if (newnd.last_type != LAST_NORM) goto exit2; + error = mnt_want_write(oldnd.path.mnt); + if (error) + goto exit2; + oldnd.flags &= ~LOOKUP_PARENT; newnd.flags &= ~LOOKUP_PARENT; newnd.flags |= LOOKUP_RENAME_TARGET; @@ -3679,23 +3796,19 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, if (new_dentry == trap) goto exit5; - error = mnt_want_write(oldnd.path.mnt); - if (error) - goto exit5; error = security_path_rename(&oldnd.path, old_dentry, &newnd.path, new_dentry); if (error) - goto exit6; + goto exit5; error = vfs_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, new_dentry); -exit6: - mnt_drop_write(oldnd.path.mnt); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_dir, old_dir); + mnt_drop_write(oldnd.path.mnt); exit2: path_put(&newnd.path); putname(to); diff --git a/fs/namespace.c b/fs/namespace.c index c53d3381b0d0..4d31f73e2561 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -283,24 +283,22 @@ static int mnt_is_readonly(struct vfsmount *mnt) } /* - * Most r/o checks on a fs are for operations that take - * discrete amounts of time, like a write() or unlink(). - * We must keep track of when those operations start - * (for permission checks) and when they end, so that - * we can determine when writes are able to occur to - * a filesystem. + * Most r/o & frozen checks on a fs are for operations that take discrete + * amounts of time, like a write() or unlink(). We must keep track of when + * those operations start (for permission checks) and when they end, so that we + * can determine when writes are able to occur to a filesystem. */ /** - * mnt_want_write - get write access to a mount + * __mnt_want_write - get write access to a mount without freeze protection * @m: the mount on which to take a write * - * This tells the low-level filesystem that a write is - * about to be performed to it, and makes sure that - * writes are allowed before returning success. When - * the write operation is finished, mnt_drop_write() - * must be called. This is effectively a refcount. + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mnt it read-write) before + * returning success. This operation does not protect against filesystem being + * frozen. When the write operation is finished, __mnt_drop_write() must be + * called. This is effectively a refcount. */ -int mnt_want_write(struct vfsmount *m) +int __mnt_want_write(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; @@ -326,6 +324,27 @@ int mnt_want_write(struct vfsmount *m) ret = -EROFS; } preempt_enable(); + + return ret; +} + +/** + * mnt_want_write - get write access to a mount + * @m: the mount on which to take a write + * + * This tells the low-level filesystem that a write is about to be performed to + * it, and makes sure that writes are allowed (mount is read-write, filesystem + * is not frozen) before returning success. When the write operation is + * finished, mnt_drop_write() must be called. This is effectively a refcount. + */ +int mnt_want_write(struct vfsmount *m) +{ + int ret; + + sb_start_write(m->mnt_sb); + ret = __mnt_want_write(m); + if (ret) + sb_end_write(m->mnt_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); @@ -355,38 +374,76 @@ int mnt_clone_write(struct vfsmount *mnt) EXPORT_SYMBOL_GPL(mnt_clone_write); /** - * mnt_want_write_file - get write access to a file's mount + * __mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like mnt_want_write, but it takes a file and can + * This is like __mnt_want_write, but it takes a file and can * do some optimisations if the file is open for write already */ -int mnt_want_write_file(struct file *file) +int __mnt_want_write_file(struct file *file) { struct inode *inode = file->f_dentry->d_inode; + if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) - return mnt_want_write(file->f_path.mnt); + return __mnt_want_write(file->f_path.mnt); else return mnt_clone_write(file->f_path.mnt); } + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int mnt_want_write_file(struct file *file) +{ + int ret; + + sb_start_write(file->f_path.mnt->mnt_sb); + ret = __mnt_want_write_file(file); + if (ret) + sb_end_write(file->f_path.mnt->mnt_sb); + return ret; +} EXPORT_SYMBOL_GPL(mnt_want_write_file); /** - * mnt_drop_write - give up write access to a mount + * __mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with - * mnt_want_write() call above. + * __mnt_want_write() call above. */ -void mnt_drop_write(struct vfsmount *mnt) +void __mnt_drop_write(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } + +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done performing writes to it and + * also allows filesystem to be frozen again. Must be matched with + * mnt_want_write() call above. + */ +void mnt_drop_write(struct vfsmount *mnt) +{ + __mnt_drop_write(mnt); + sb_end_write(mnt->mnt_sb); +} EXPORT_SYMBOL_GPL(mnt_drop_write); +void __mnt_drop_write_file(struct file *file) +{ + __mnt_drop_write(file->f_path.mnt); +} + void mnt_drop_write_file(struct file *file) { mnt_drop_write(file->f_path.mnt); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 5ff0b7b9fc08..43295d45cc2b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -154,6 +154,10 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) if (status < 0) return; + status = mnt_want_write_file(rec_file); + if (status) + return; + dir = rec_file->f_path.dentry; /* lock the parent */ mutex_lock(&dir->d_inode->i_mutex); @@ -173,11 +177,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. */ goto out_put; - status = mnt_want_write_file(rec_file); - if (status) - goto out_put; status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); - mnt_drop_write_file(rec_file); out_put: dput(dentry); out_unlock: @@ -189,6 +189,7 @@ out_unlock: " (err %d); please check that %s exists" " and is writeable", status, user_recovery_dirname); + mnt_drop_write_file(rec_file); nfs4_reset_creds(original_cred); } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index cc793005a87c..032af381b3aa 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -635,6 +635,7 @@ fh_put(struct svc_fh *fhp) fhp->fh_post_saved = 0; #endif } + fh_drop_write(fhp); if (exp) { exp_put(exp); fhp->fh_export = NULL; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index e15dc45fc5ec..aad6d457b9e8 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -196,6 +196,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, struct dentry *dchild; int type, mode; __be32 nfserr; + int hosterr; dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); dprintk("nfsd: CREATE %s %.*s\n", @@ -214,6 +215,12 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, nfserr = nfserr_exist; if (isdotent(argp->name, argp->len)) goto done; + hosterr = fh_want_write(dirfhp); + if (hosterr) { + nfserr = nfserrno(hosterr); + goto done; + } + fh_lock_nested(dirfhp, I_MUTEX_PARENT); dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); if (IS_ERR(dchild)) { @@ -330,7 +337,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, out_unlock: /* We don't really need to unlock, as fh_put does it. */ fh_unlock(dirfhp); - + fh_drop_write(dirfhp); done: fh_put(dirfhp); return nfsd_return_dirop(nfserr, resp); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 702f64e820c3..a9269f142cc4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1284,6 +1284,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, * If it has, the parent directory should already be locked. */ if (!resfhp->fh_dentry) { + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ fh_lock_nested(fhp, I_MUTEX_PARENT); dchild = lookup_one_len(fname, dentry, flen); @@ -1327,14 +1331,11 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; } - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; - /* * Get the dir op function pointer. */ err = 0; + host_err = 0; switch (type) { case S_IFREG: host_err = vfs_create(dirp, dchild, iap->ia_mode, true); @@ -1351,10 +1352,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); break; } - if (host_err < 0) { - fh_drop_write(fhp); + if (host_err < 0) goto out_nfserr; - } err = nfsd_create_setattr(rqstp, resfhp, iap); @@ -1366,7 +1365,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err2 = nfserrno(commit_metadata(fhp)); if (err2) err = err2; - fh_drop_write(fhp); /* * Update the file handle to get the new inode info. */ @@ -1425,6 +1423,11 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserr_notdir; if (!dirp->i_op->lookup) goto out; + + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + fh_lock_nested(fhp, I_MUTEX_PARENT); /* @@ -1457,9 +1460,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, v_atime = verifier[1]&0x7fffffff; } - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; if (dchild->d_inode) { err = 0; @@ -1530,7 +1530,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!err) err = nfserrno(commit_metadata(fhp)); - fh_drop_write(fhp); /* * Update the filehandle to get the new inode info. */ @@ -1541,6 +1540,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, fh_unlock(fhp); if (dchild && !IS_ERR(dchild)) dput(dchild); + fh_drop_write(fhp); return err; out_nfserr: @@ -1621,6 +1621,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; + + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + fh_lock(fhp); dentry = fhp->fh_dentry; dnew = lookup_one_len(fname, dentry, flen); @@ -1628,10 +1633,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - host_err = fh_want_write(fhp); - if (host_err) - goto out_nfserr; - if (unlikely(path[plen] != 0)) { char *path_alloced = kmalloc(plen+1, GFP_KERNEL); if (path_alloced == NULL) @@ -1691,6 +1692,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (isdotent(name, len)) goto out; + host_err = fh_want_write(tfhp); + if (host_err) { + err = nfserrno(host_err); + goto out; + } + fh_lock_nested(ffhp, I_MUTEX_PARENT); ddir = ffhp->fh_dentry; dirp = ddir->d_inode; @@ -1702,18 +1709,13 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, dold = tfhp->fh_dentry; - host_err = fh_want_write(tfhp); - if (host_err) { - err = nfserrno(host_err); - goto out_dput; - } err = nfserr_noent; if (!dold->d_inode) - goto out_drop_write; + goto out_dput; host_err = nfsd_break_lease(dold->d_inode); if (host_err) { err = nfserrno(host_err); - goto out_drop_write; + goto out_dput; } host_err = vfs_link(dold, dirp, dnew); if (!host_err) { @@ -1726,12 +1728,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, else err = nfserrno(host_err); } -out_drop_write: - fh_drop_write(tfhp); out_dput: dput(dnew); out_unlock: fh_unlock(ffhp); + fh_drop_write(tfhp); out: return err; @@ -1774,6 +1775,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) goto out; + host_err = fh_want_write(ffhp); + if (host_err) { + err = nfserrno(host_err); + goto out; + } + /* cannot use fh_lock as we need deadlock protective ordering * so do it by hand */ trap = lock_rename(tdentry, fdentry); @@ -1804,17 +1811,14 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, host_err = -EXDEV; if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) goto out_dput_new; - host_err = fh_want_write(ffhp); - if (host_err) - goto out_dput_new; host_err = nfsd_break_lease(odentry->d_inode); if (host_err) - goto out_drop_write; + goto out_dput_new; if (ndentry->d_inode) { host_err = nfsd_break_lease(ndentry->d_inode); if (host_err) - goto out_drop_write; + goto out_dput_new; } host_err = vfs_rename(fdir, odentry, tdir, ndentry); if (!host_err) { @@ -1822,8 +1826,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (!host_err) host_err = commit_metadata(ffhp); } -out_drop_write: - fh_drop_write(ffhp); out_dput_new: dput(ndentry); out_dput_old: @@ -1839,6 +1841,7 @@ out_drop_write: fill_post_wcc(tfhp); unlock_rename(tdentry, fdentry); ffhp->fh_locked = tfhp->fh_locked = 0; + fh_drop_write(ffhp); out: return err; @@ -1864,6 +1867,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (err) goto out; + host_err = fh_want_write(fhp); + if (host_err) + goto out_nfserr; + fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = fhp->fh_dentry; dirp = dentry->d_inode; @@ -1882,21 +1889,15 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = rdentry->d_inode->i_mode & S_IFMT; - host_err = fh_want_write(fhp); - if (host_err) - goto out_put; - host_err = nfsd_break_lease(rdentry->d_inode); if (host_err) - goto out_drop_write; + goto out_put; if (type != S_IFDIR) host_err = vfs_unlink(dirp, rdentry); else host_err = vfs_rmdir(dirp, rdentry); if (!host_err) host_err = commit_metadata(fhp); -out_drop_write: - fh_drop_write(fhp); out_put: dput(rdentry); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index ec0611b2b738..359594c393d2 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -110,12 +110,19 @@ int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); static inline int fh_want_write(struct svc_fh *fh) { - return mnt_want_write(fh->fh_export->ex_path.mnt); + int ret = mnt_want_write(fh->fh_export->ex_path.mnt); + + if (!ret) + fh->fh_want_write = 1; + return ret; } static inline void fh_drop_write(struct svc_fh *fh) { - mnt_drop_write(fh->fh_export->ex_path.mnt); + if (fh->fh_want_write) { + fh->fh_want_write = 0; + mnt_drop_write(fh->fh_export->ex_path.mnt); + } } #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 62cebc8e1a1f..a4d56ac02e6c 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -69,16 +69,18 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_dentry->d_inode; struct nilfs_transaction_info ti; - int ret; + int ret = 0; if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) return VM_FAULT_SIGBUS; /* -ENOSPC */ + sb_start_pagefault(inode->i_sb); lock_page(page); if (page->mapping != inode->i_mapping || page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { unlock_page(page); - return VM_FAULT_NOPAGE; /* make the VM retry the fault */ + ret = -EFAULT; /* make the VM retry the fault */ + goto out; } /* @@ -112,19 +114,21 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); /* never returns -ENOMEM, but may return -ENOSPC */ if (unlikely(ret)) - return VM_FAULT_SIGBUS; + goto out; - ret = block_page_mkwrite(vma, vmf, nilfs_get_block); - if (ret != VM_FAULT_LOCKED) { + ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); + if (ret) { nilfs_transaction_abort(inode->i_sb); - return ret; + goto out; } nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits)); nilfs_transaction_commit(inode->i_sb); mapped: wait_on_page_writeback(page); - return VM_FAULT_LOCKED; + out: + sb_end_pagefault(inode->i_sb); + return block_page_mkwrite_return(ret); } static const struct vm_operations_struct nilfs_file_vm_ops = { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 0b6387c67e6c..fdb180769485 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -660,8 +660,6 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, goto out_free; } - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]); if (ret < 0) printk(KERN_ERR "NILFS: GC failed during preparation: " diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 88e11fb346b6..a5752a589932 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -189,7 +189,7 @@ int nilfs_transaction_begin(struct super_block *sb, if (ret > 0) return 0; - vfs_check_frozen(sb, SB_FREEZE_WRITE); + sb_start_intwrite(sb); nilfs = sb->s_fs_info; down_read(&nilfs->ns_segctor_sem); @@ -205,6 +205,7 @@ int nilfs_transaction_begin(struct super_block *sb, current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); + sb_end_intwrite(sb); return ret; } @@ -246,6 +247,7 @@ int nilfs_transaction_commit(struct super_block *sb) err = nilfs_construct_segment(sb); if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); + sb_end_intwrite(sb); return err; } @@ -264,6 +266,7 @@ void nilfs_transaction_abort(struct super_block *sb) current->journal_info = ti->ti_save; if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) kmem_cache_free(nilfs_transaction_cachep, ti); + sb_end_intwrite(sb); } void nilfs_relax_pressure_in_lock(struct super_block *sb) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 7389d2d5e51d..1ecf46448f85 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2084,7 +2084,6 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, if (err) return err; pos = *ppos; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); /* We can write back this queue in page reclaim. */ current->backing_dev_info = mapping->backing_dev_info; written = 0; @@ -2119,6 +2118,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); + sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2127,6 +2127,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0) ret = err; } + sb_end_write(inode->i_sb); return ret; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7602783d7f41..46a1f6d75104 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1971,6 +1971,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, { struct inode *inode = file->f_path.dentry->d_inode; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int ret; if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && !ocfs2_writes_unwritten_extents(osb)) @@ -1985,7 +1986,12 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, if (!(file->f_mode & FMODE_WRITE)) return -EBADF; - return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); + ret = mnt_want_write_file(file); + if (ret) + return ret; + ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); + mnt_drop_write_file(file); + return ret; } static long ocfs2_fallocate(struct file *file, int mode, loff_t offset, @@ -2261,7 +2267,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, if (iocb->ki_left == 0) return 0; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + sb_start_write(inode->i_sb); appending = file->f_flags & O_APPEND ? 1 : 0; direct_io = file->f_flags & O_DIRECT ? 1 : 0; @@ -2436,6 +2442,7 @@ out_sems: ocfs2_iocb_clear_sem_locked(iocb); mutex_unlock(&inode->i_mutex); + sb_end_write(inode->i_sb); if (written) ret = written; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index d96f7f81d8dd..f20edcbfe700 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -928,7 +928,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(new_clusters, (int __user *)arg)) return -EFAULT; - return ocfs2_group_extend(inode, new_clusters); + status = mnt_want_write_file(filp); + if (status) + return status; + status = ocfs2_group_extend(inode, new_clusters); + mnt_drop_write_file(filp); + return status; case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: if (!capable(CAP_SYS_RESOURCE)) @@ -937,7 +942,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (copy_from_user(&input, (int __user *) arg, sizeof(input))) return -EFAULT; - return ocfs2_group_add(inode, &input); + status = mnt_want_write_file(filp); + if (status) + return status; + status = ocfs2_group_add(inode, &input); + mnt_drop_write_file(filp); + return status; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 0a42ae96dca7..2dd36af79e26 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -355,11 +355,14 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) if (journal_current_handle()) return jbd2_journal_start(journal, max_buffs); + sb_start_intwrite(osb->sb); + down_read(&osb->journal->j_trans_barrier); handle = jbd2_journal_start(journal, max_buffs); if (IS_ERR(handle)) { up_read(&osb->journal->j_trans_barrier); + sb_end_intwrite(osb->sb); mlog_errno(PTR_ERR(handle)); @@ -388,8 +391,10 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, if (ret < 0) mlog_errno(ret); - if (!nested) + if (!nested) { up_read(&journal->j_trans_barrier); + sb_end_intwrite(osb->sb); + } return ret; } diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 9cd41083e991..d150372fd81d 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -136,6 +136,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sigset_t oldset; int ret; + sb_start_pagefault(inode->i_sb); ocfs2_block_signals(&oldset); /* @@ -165,6 +166,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out: ocfs2_unblock_signals(&oldset); + sb_end_pagefault(inode->i_sb); return ret; } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9f32d7cbb7a3..30a055049e16 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4466,20 +4466,11 @@ int ocfs2_reflink_ioctl(struct inode *inode, goto out_dput; } - error = mnt_want_write(new_path.mnt); - if (error) { - mlog_errno(error); - goto out_dput; - } - error = ocfs2_vfs_reflink(old_path.dentry, new_path.dentry->d_inode, new_dentry, preserve); - mnt_drop_write(new_path.mnt); out_dput: - dput(new_dentry); - mutex_unlock(&new_path.dentry->d_inode->i_mutex); - path_put(&new_path); + done_path_create(&new_path, new_dentry); out: path_put(&old_path); diff --git a/fs/open.c b/fs/open.c index 1e914b397e12..f3d96e7e7b19 100644 --- a/fs/open.c +++ b/fs/open.c @@ -164,11 +164,13 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) if (IS_APPEND(inode)) goto out_putf; + sb_start_write(inode->i_sb); error = locks_verify_truncate(inode, file, length); if (!error) error = security_path_truncate(&file->f_path); if (!error) error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file); + sb_end_write(inode->i_sb); out_putf: fput(file); out: @@ -266,7 +268,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!file->f_op->fallocate) return -EOPNOTSUPP; - return file->f_op->fallocate(file, mode, offset, len); + sb_start_write(inode->i_sb); + ret = file->f_op->fallocate(file, mode, offset, len); + sb_end_write(inode->i_sb); + return ret; } SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) @@ -620,7 +625,7 @@ static inline int __get_file_write_access(struct inode *inode, /* * Balanced in __fput() */ - error = mnt_want_write(mnt); + error = __mnt_want_write(mnt); if (error) put_write_access(inode); } @@ -654,6 +659,7 @@ static int do_dentry_open(struct file *f, if (unlikely(f->f_flags & O_PATH)) f->f_mode = FMODE_PATH; + path_get(&f->f_path); inode = f->f_path.dentry->d_inode; if (f->f_mode & FMODE_WRITE) { error = __get_file_write_access(inode, f->f_path.mnt); @@ -739,9 +745,7 @@ int finish_open(struct file *file, struct dentry *dentry, int error; BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ - mntget(file->f_path.mnt); - file->f_path.dentry = dget(dentry); - + file->f_path.dentry = dentry; error = do_dentry_open(file, open, current_cred()); if (!error) *opened |= FILE_OPENED; @@ -784,7 +788,6 @@ struct file *dentry_open(const struct path *path, int flags, f->f_flags = flags; f->f_path = *path; - path_get(&f->f_path); error = do_dentry_open(f, NULL, cred); if (!error) { error = open_check_o_direct(f); diff --git a/fs/pipe.c b/fs/pipe.c index 95cbd6b227e6..8d85d7068c1e 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1016,18 +1016,16 @@ fail_inode: return NULL; } -struct file *create_write_pipe(int flags) +int create_pipe_files(struct file **res, int flags) { int err; - struct inode *inode; + struct inode *inode = get_pipe_inode(); struct file *f; struct path path; - struct qstr name = { .name = "" }; + static struct qstr name = { .name = "" }; - err = -ENFILE; - inode = get_pipe_inode(); if (!inode) - goto err; + return -ENFILE; err = -ENOMEM; path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name); @@ -1041,62 +1039,43 @@ struct file *create_write_pipe(int flags) f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); if (!f) goto err_dentry; - f->f_mapping = inode->i_mapping; f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); - f->f_version = 0; - return f; + res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops); + if (!res[0]) + goto err_file; + + path_get(&path); + res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK); + res[1] = f; + return 0; - err_dentry: +err_file: + put_filp(f); +err_dentry: free_pipe_info(inode); path_put(&path); - return ERR_PTR(err); + return err; - err_inode: +err_inode: free_pipe_info(inode); iput(inode); - err: - return ERR_PTR(err); -} - -void free_write_pipe(struct file *f) -{ - free_pipe_info(f->f_dentry->d_inode); - path_put(&f->f_path); - put_filp(f); -} - -struct file *create_read_pipe(struct file *wrf, int flags) -{ - /* Grab pipe from the writer */ - struct file *f = alloc_file(&wrf->f_path, FMODE_READ, - &read_pipefifo_fops); - if (!f) - return ERR_PTR(-ENFILE); - - path_get(&wrf->f_path); - f->f_flags = O_RDONLY | (flags & O_NONBLOCK); - - return f; + return err; } int do_pipe_flags(int *fd, int flags) { - struct file *fw, *fr; + struct file *files[2]; int error; int fdw, fdr; if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) return -EINVAL; - fw = create_write_pipe(flags); - if (IS_ERR(fw)) - return PTR_ERR(fw); - fr = create_read_pipe(fw, flags); - error = PTR_ERR(fr); - if (IS_ERR(fr)) - goto err_write_pipe; + error = create_pipe_files(files, flags); + if (error) + return error; error = get_unused_fd_flags(flags); if (error < 0) @@ -1109,8 +1088,8 @@ int do_pipe_flags(int *fd, int flags) fdw = error; audit_fd_pair(fdr, fdw); - fd_install(fdr, fr); - fd_install(fdw, fw); + fd_install(fdr, files[0]); + fd_install(fdw, files[1]); fd[0] = fdr; fd[1] = fdw; @@ -1119,10 +1098,8 @@ int do_pipe_flags(int *fd, int flags) err_fdr: put_unused_fd(fdr); err_read_pipe: - path_put(&fr->f_path); - put_filp(fr); - err_write_pipe: - free_write_pipe(fw); + fput(files[0]); + fput(files[1]); return error; } diff --git a/fs/splice.c b/fs/splice.c index 7bf08fa22ec9..41514dd89462 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -996,6 +996,8 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; + sb_start_write(inode->i_sb); + pipe_lock(pipe); splice_from_pipe_begin(&sd); @@ -1034,6 +1036,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, *ppos += ret; balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } + sb_end_write(inode->i_sb); return ret; } diff --git a/fs/super.c b/fs/super.c index 4bf714459a4b..b05cf47463d0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -33,12 +33,19 @@ #include <linux/rculist_bl.h> #include <linux/cleancache.h> #include <linux/fsnotify.h> +#include <linux/lockdep.h> #include "internal.h" LIST_HEAD(super_blocks); DEFINE_SPINLOCK(sb_lock); +static char *sb_writers_name[SB_FREEZE_LEVELS] = { + "sb_writers", + "sb_pagefaults", + "sb_internal", +}; + /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. @@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) return total_objects; } +static int init_sb_writers(struct super_block *s, struct file_system_type *type) +{ + int err; + int i; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) { + err = percpu_counter_init(&s->s_writers.counter[i], 0); + if (err < 0) + goto err_out; + lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], + &type->s_writers_key[i], 0); + } + init_waitqueue_head(&s->s_writers.wait); + init_waitqueue_head(&s->s_writers.wait_unfrozen); + return 0; +err_out: + while (--i >= 0) + percpu_counter_destroy(&s->s_writers.counter[i]); + return err; +} + +static void destroy_sb_writers(struct super_block *s) +{ + int i; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) + percpu_counter_destroy(&s->s_writers.counter[i]); +} + /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to @@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) if (s) { if (security_sb_alloc(s)) { + /* + * We cannot call security_sb_free() without + * security_sb_alloc() succeeding. So bail out manually + */ kfree(s); s = NULL; goto out; } #ifdef CONFIG_SMP s->s_files = alloc_percpu(struct list_head); - if (!s->s_files) { - security_sb_free(s); - kfree(s); - s = NULL; - goto out; - } else { + if (!s->s_files) + goto err_out; + else { int i; for_each_possible_cpu(i) @@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) #else INIT_LIST_HEAD(&s->s_files); #endif + if (init_sb_writers(s, type)) + goto err_out; s->s_flags = flags; s->s_bdi = &default_backing_dev_info; INIT_HLIST_NODE(&s->s_instances); @@ -178,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) mutex_init(&s->s_dquot.dqio_mutex); mutex_init(&s->s_dquot.dqonoff_mutex); init_rwsem(&s->s_dquot.dqptr_sem); - init_waitqueue_head(&s->s_wait_unfrozen); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; @@ -190,6 +228,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) } out: return s; +err_out: + security_sb_free(s); +#ifdef CONFIG_SMP + if (s->s_files) + free_percpu(s->s_files); +#endif + destroy_sb_writers(s); + kfree(s); + s = NULL; + goto out; } /** @@ -203,6 +251,7 @@ static inline void destroy_super(struct super_block *s) #ifdef CONFIG_SMP free_percpu(s->s_files); #endif + destroy_sb_writers(s); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); kfree(s->s_subtype); @@ -651,10 +700,11 @@ struct super_block *get_super_thawed(struct block_device *bdev) { while (1) { struct super_block *s = get_super(bdev); - if (!s || s->s_frozen == SB_UNFROZEN) + if (!s || s->s_writers.frozen == SB_UNFROZEN) return s; up_read(&s->s_umount); - vfs_check_frozen(s, SB_FREEZE_WRITE); + wait_event(s->s_writers.wait_unfrozen, + s->s_writers.frozen == SB_UNFROZEN); put_super(s); } } @@ -732,7 +782,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) int retval; int remount_ro; - if (sb->s_frozen != SB_UNFROZEN) + if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; #ifdef CONFIG_BLOCK @@ -1163,6 +1213,120 @@ out: return ERR_PTR(error); } +/* + * This is an internal function, please use sb_end_{write,pagefault,intwrite} + * instead. + */ +void __sb_end_write(struct super_block *sb, int level) +{ + percpu_counter_dec(&sb->s_writers.counter[level-1]); + /* + * Make sure s_writers are updated before we wake up waiters in + * freeze_super(). + */ + smp_mb(); + if (waitqueue_active(&sb->s_writers.wait)) + wake_up(&sb->s_writers.wait); + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); +} +EXPORT_SYMBOL(__sb_end_write); + +#ifdef CONFIG_LOCKDEP +/* + * We want lockdep to tell us about possible deadlocks with freezing but + * it's it bit tricky to properly instrument it. Getting a freeze protection + * works as getting a read lock but there are subtle problems. XFS for example + * gets freeze protection on internal level twice in some cases, which is OK + * only because we already hold a freeze protection also on higher level. Due + * to these cases we have to tell lockdep we are doing trylock when we + * already hold a freeze protection for a higher freeze level. + */ +static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, + unsigned long ip) +{ + int i; + + if (!trylock) { + for (i = 0; i < level - 1; i++) + if (lock_is_held(&sb->s_writers.lock_map[i])) { + trylock = true; + break; + } + } + rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); +} +#endif + +/* + * This is an internal function, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +int __sb_start_write(struct super_block *sb, int level, bool wait) +{ +retry: + if (unlikely(sb->s_writers.frozen >= level)) { + if (!wait) + return 0; + wait_event(sb->s_writers.wait_unfrozen, + sb->s_writers.frozen < level); + } + +#ifdef CONFIG_LOCKDEP + acquire_freeze_lock(sb, level, !wait, _RET_IP_); +#endif + percpu_counter_inc(&sb->s_writers.counter[level-1]); + /* + * Make sure counter is updated before we check for frozen. + * freeze_super() first sets frozen and then checks the counter. + */ + smp_mb(); + if (unlikely(sb->s_writers.frozen >= level)) { + __sb_end_write(sb, level); + goto retry; + } + return 1; +} +EXPORT_SYMBOL(__sb_start_write); + +/** + * sb_wait_write - wait until all writers to given file system finish + * @sb: the super for which we wait + * @level: type of writers we wait for (normal vs page fault) + * + * This function waits until there are no writers of given type to given file + * system. Caller of this function should make sure there can be no new writers + * of type @level before calling this function. Otherwise this function can + * livelock. + */ +static void sb_wait_write(struct super_block *sb, int level) +{ + s64 writers; + + /* + * We just cycle-through lockdep here so that it does not complain + * about returning with lock to userspace + */ + rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); + + do { + DEFINE_WAIT(wait); + + /* + * We use a barrier in prepare_to_wait() to separate setting + * of frozen and checking of the counter + */ + prepare_to_wait(&sb->s_writers.wait, &wait, + TASK_UNINTERRUPTIBLE); + + writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); + if (writers) + schedule(); + + finish_wait(&sb->s_writers.wait, &wait); + } while (writers); +} + /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock @@ -1170,6 +1334,31 @@ out: * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs will return * -EBUSY. + * + * During this function, sb->s_writers.frozen goes through these values: + * + * SB_UNFROZEN: File system is normal, all writes progress as usual. + * + * SB_FREEZE_WRITE: The file system is in the process of being frozen. New + * writes should be blocked, though page faults are still allowed. We wait for + * all writes to complete and then proceed to the next stage. + * + * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked + * but internal fs threads can still modify the filesystem (although they + * should not dirty new pages or inodes), writeback can run etc. After waiting + * for all running page faults we sync the filesystem which will clean all + * dirty pages and inodes (no new dirty pages or inodes can be created when + * sync is running). + * + * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs + * modification are blocked (e.g. XFS preallocation truncation on inode + * reclaim). This is usually implemented by blocking new transactions for + * filesystems that have them and need this additional guard. After all + * internal writers are finished we call ->freeze_fs() to finish filesystem + * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is + * mostly auxiliary for filesystems to verify they do not modify frozen fs. + * + * sb->s_writers.frozen is protected by sb->s_umount. */ int freeze_super(struct super_block *sb) { @@ -1177,7 +1366,7 @@ int freeze_super(struct super_block *sb) atomic_inc(&sb->s_active); down_write(&sb->s_umount); - if (sb->s_frozen) { + if (sb->s_writers.frozen != SB_UNFROZEN) { deactivate_locked_super(sb); return -EBUSY; } @@ -1188,33 +1377,53 @@ int freeze_super(struct super_block *sb) } if (sb->s_flags & MS_RDONLY) { - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); + /* Nothing to do really... */ + sb->s_writers.frozen = SB_FREEZE_COMPLETE; up_write(&sb->s_umount); return 0; } - sb->s_frozen = SB_FREEZE_WRITE; + /* From now on, no new normal writers can start */ + sb->s_writers.frozen = SB_FREEZE_WRITE; + smp_wmb(); + + /* Release s_umount to preserve sb_start_write -> s_umount ordering */ + up_write(&sb->s_umount); + + sb_wait_write(sb, SB_FREEZE_WRITE); + + /* Now we go and block page faults... */ + down_write(&sb->s_umount); + sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; smp_wmb(); + sb_wait_write(sb, SB_FREEZE_PAGEFAULT); + + /* All writers are done so after syncing there won't be dirty data */ sync_filesystem(sb); - sb->s_frozen = SB_FREEZE_TRANS; + /* Now wait for internal filesystem counter */ + sb->s_writers.frozen = SB_FREEZE_FS; smp_wmb(); + sb_wait_write(sb, SB_FREEZE_FS); - sync_blockdev(sb->s_bdev); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; + sb->s_writers.frozen = SB_UNFROZEN; smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; } } + /* + * This is just for debugging purposes so that fs can warn if it + * sees write activity when frozen is set to SB_FREEZE_COMPLETE. + */ + sb->s_writers.frozen = SB_FREEZE_COMPLETE; up_write(&sb->s_umount); return 0; } @@ -1231,7 +1440,7 @@ int thaw_super(struct super_block *sb) int error; down_write(&sb->s_umount); - if (sb->s_frozen == SB_UNFROZEN) { + if (sb->s_writers.frozen == SB_UNFROZEN) { up_write(&sb->s_umount); return -EINVAL; } @@ -1244,16 +1453,15 @@ int thaw_super(struct super_block *sb) if (error) { printk(KERN_ERR "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; up_write(&sb->s_umount); return error; } } out: - sb->s_frozen = SB_UNFROZEN; + sb->s_writers.frozen = SB_UNFROZEN; smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return 0; diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index a4759833d62d..614b2b544880 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -228,6 +228,8 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = 0; if (bb->vm_ops->page_mkwrite) ret = bb->vm_ops->page_mkwrite(vma, vmf); + else + file_update_time(file); sysfs_put_active(attr_sd); return ret; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 15052ff916ec..e562dd43f41f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -124,6 +124,12 @@ xfs_setfilesize_trans_alloc( ioend->io_append_trans = tp; /* + * We will pass freeze protection with a transaction. So tell lockdep + * we released it. + */ + rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); + /* * We hand off the transaction to the completion thread now, so * clear the flag here. */ @@ -199,6 +205,15 @@ xfs_end_io( struct xfs_inode *ip = XFS_I(ioend->io_inode); int error = 0; + if (ioend->io_append_trans) { + /* + * We've got freeze protection passed with the transaction. + * Tell lockdep about it. + */ + rwsem_acquire_read( + &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); + } if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { ioend->io_error = -EIO; goto done; @@ -1425,6 +1440,9 @@ out_trans_cancel: if (ioend->io_append_trans) { current_set_flags_nested(&ioend->io_append_trans->t_pflags, PF_FSTRANS); + rwsem_acquire_read( + &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); xfs_trans_cancel(ioend->io_append_trans, 0); } out_destroy_ioend: diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c4559c6e6f2c..56afcdb2377d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -770,10 +770,12 @@ xfs_file_aio_write( if (ocount == 0) return 0; - xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE); + sb_start_write(inode->i_sb); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + ret = -EIO; + goto out; + } if (unlikely(file->f_flags & O_DIRECT)) ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount); @@ -792,6 +794,8 @@ xfs_file_aio_write( ret = err; } +out: + sb_end_write(inode->i_sb); return ret; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1f1535d25a9b..0e0232c3b6d9 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -364,9 +364,15 @@ xfs_fssetdm_by_handle( if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(parfilp); + if (error) + return error; + dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + mnt_drop_write_file(parfilp); return PTR_ERR(dentry); + } if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { error = -XFS_ERROR(EPERM); @@ -382,6 +388,7 @@ xfs_fssetdm_by_handle( fsd.fsd_dmstate); out: + mnt_drop_write_file(parfilp); dput(dentry); return error; } @@ -634,7 +641,11 @@ xfs_ioc_space( if (ioflags & IO_INVIS) attr_flags |= XFS_ATTR_DMI; + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags); + mnt_drop_write_file(filp); return -error; } @@ -1163,6 +1174,7 @@ xfs_ioc_fssetxattr( { struct fsxattr fa; unsigned int mask; + int error; if (copy_from_user(&fa, arg, sizeof(fa))) return -EFAULT; @@ -1171,7 +1183,12 @@ xfs_ioc_fssetxattr( if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) mask |= FSX_NONBLOCK; - return -xfs_ioctl_setattr(ip, &fa, mask); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa, mask); + mnt_drop_write_file(filp); + return -error; } STATIC int @@ -1196,6 +1213,7 @@ xfs_ioc_setxflags( struct fsxattr fa; unsigned int flags; unsigned int mask; + int error; if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; @@ -1210,7 +1228,12 @@ xfs_ioc_setxflags( mask |= FSX_NONBLOCK; fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); - return -xfs_ioctl_setattr(ip, &fa, mask); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa, mask); + mnt_drop_write_file(filp); + return -error; } STATIC int @@ -1385,8 +1408,13 @@ xfs_file_ioctl( if (copy_from_user(&dmi, arg, sizeof(dmi))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, dmi.fsd_dmstate); + mnt_drop_write_file(filp); return -error; } @@ -1434,7 +1462,11 @@ xfs_file_ioctl( if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_swapext(&sxp); + mnt_drop_write_file(filp); return -error; } @@ -1463,9 +1495,14 @@ xfs_file_ioctl( if (copy_from_user(&inout, arg, sizeof(inout))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + /* input parameter is passed in resblks field of structure */ in = inout.resblks; error = xfs_reserve_blocks(mp, &in, &inout); + mnt_drop_write_file(filp); if (error) return -error; @@ -1496,7 +1533,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); return -error; } @@ -1506,7 +1547,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_log(mp, &in); + mnt_drop_write_file(filp); return -error; } @@ -1516,7 +1561,11 @@ xfs_file_ioctl( if (copy_from_user(&in, arg, sizeof(in))) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); return -error; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index c4f2da0d2bf5..1244274a5674 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -600,7 +600,11 @@ xfs_file_compat_ioctl( if (xfs_compat_growfs_data_copyin(&in, arg)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); return -error; } case XFS_IOC_FSGROWFSRT_32: { @@ -608,7 +612,11 @@ xfs_file_compat_ioctl( if (xfs_compat_growfs_rt_copyin(&in, arg)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); return -error; } #endif @@ -627,7 +635,11 @@ xfs_file_compat_ioctl( offsetof(struct xfs_swapext, sx_stat)) || xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; error = xfs_swapext(&sxp); + mnt_drop_write_file(filp); return -error; } case XFS_IOC_FSBULKSTAT_32: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 915edf6639f0..973dff6ad935 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -680,9 +680,9 @@ xfs_iomap_write_unwritten( * the same inode that we complete here and might deadlock * on the iolock. */ - xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); + sb_start_intwrite(mp->m_super); tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); - tp->t_flags |= XFS_TRANS_RESERVE; + tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 711ca51ca3d7..29c2f83d4147 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1551,7 +1551,7 @@ xfs_unmountfs( int xfs_fs_writable(xfs_mount_t *mp) { - return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) || + return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY)); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8724336a9a08..05a05a7b6119 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -311,9 +311,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, #define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ #define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ -#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen) -#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l)) - /* * Flags for xfs_mountfs */ diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index 97304f10e78a..96548176db80 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -403,7 +403,7 @@ xfs_sync_worker( if (!(mp->m_super->s_flags & MS_ACTIVE) && !(mp->m_flags & XFS_MOUNT_RDONLY)) { /* dgc: errors ignored here */ - if (mp->m_super->s_frozen == SB_UNFROZEN && + if (mp->m_super->s_writers.frozen == SB_UNFROZEN && xfs_log_need_covered(mp)) error = xfs_fs_log_dummy(mp); else diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index fdf324508c5e..06ed520a767f 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -576,8 +576,12 @@ xfs_trans_alloc( xfs_mount_t *mp, uint type) { - xfs_wait_for_freeze(mp, SB_FREEZE_TRANS); - return _xfs_trans_alloc(mp, type, KM_SLEEP); + xfs_trans_t *tp; + + sb_start_intwrite(mp->m_super); + tp = _xfs_trans_alloc(mp, type, KM_SLEEP); + tp->t_flags |= XFS_TRANS_FREEZE_PROT; + return tp; } xfs_trans_t * @@ -588,6 +592,7 @@ _xfs_trans_alloc( { xfs_trans_t *tp; + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); tp = kmem_zone_zalloc(xfs_trans_zone, memflags); @@ -611,6 +616,8 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); atomic_dec(&tp->t_mountp->m_active_trans); + if (tp->t_flags & XFS_TRANS_FREEZE_PROT) + sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); kmem_zone_free(xfs_trans_zone, tp); } @@ -643,7 +650,11 @@ xfs_trans_dup( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(tp->t_ticket != NULL); - ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); + ntp->t_flags = XFS_TRANS_PERM_LOG_RES | + (tp->t_flags & XFS_TRANS_RESERVE) | + (tp->t_flags & XFS_TRANS_FREEZE_PROT); + /* We gave our writer reference to the new transaction */ + tp->t_flags &= ~XFS_TRANS_FREEZE_PROT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; tp->t_blk_res = tp->t_blk_res_used; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index bc2afd52a0b7..db056544cbb5 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -179,6 +179,8 @@ struct xfs_log_item_desc { #define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ #define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ #define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ +#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer + count in superblock */ /* * Values for call flags parameter. diff --git a/include/linux/audit.h b/include/linux/audit.h index 22f292a917a3..36abf2aa7e68 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -130,6 +130,7 @@ #define AUDIT_LAST_KERN_ANOM_MSG 1799 #define AUDIT_ANOM_PROMISCUOUS 1700 /* Device changed promiscuous mode */ #define AUDIT_ANOM_ABEND 1701 /* Process ended abnormally */ +#define AUDIT_ANOM_LINK 1702 /* Suspicious use of file links */ #define AUDIT_INTEGRITY_DATA 1800 /* Data integrity verification */ #define AUDIT_INTEGRITY_METADATA 1801 /* Metadata integrity verification */ #define AUDIT_INTEGRITY_STATUS 1802 /* Integrity enable status */ @@ -687,6 +688,8 @@ extern void audit_log_d_path(struct audit_buffer *ab, const struct path *path); extern void audit_log_key(struct audit_buffer *ab, char *key); +extern void audit_log_link_denied(const char *operation, + struct path *link); extern void audit_log_lost(const char *message); #ifdef CONFIG_SECURITY extern void audit_log_secctx(struct audit_buffer *ab, u32 secid); @@ -716,6 +719,7 @@ extern int audit_enabled; #define audit_log_untrustedstring(a,s) do { ; } while (0) #define audit_log_d_path(b, p, d) do { ; } while (0) #define audit_log_key(b, k) do { ; } while (0) +#define audit_log_link_denied(o, l) do { ; } while (0) #define audit_log_secctx(b,s) do { ; } while (0) #define audit_enabled 0 #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 4ba5c8715523..38dba16c4176 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -414,6 +414,7 @@ struct inodes_stat_t { #include <linux/shrinker.h> #include <linux/migrate_mode.h> #include <linux/uidgid.h> +#include <linux/lockdep.h> #include <asm/byteorder.h> @@ -440,6 +441,8 @@ extern unsigned long get_max_files(void); extern int sysctl_nr_open; extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; +extern int sysctl_protected_symlinks; +extern int sysctl_protected_hardlinks; struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, @@ -1445,6 +1448,8 @@ extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct fown_struct *fown); +struct mm_struct; + /* * Umount options */ @@ -1458,6 +1463,31 @@ extern int send_sigurg(struct fown_struct *fown); extern struct list_head super_blocks; extern spinlock_t sb_lock; +/* Possible states of 'frozen' field */ +enum { + SB_UNFROZEN = 0, /* FS is unfrozen */ + SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ + SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ + SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop + * internal threads if needed) */ + SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ +}; + +#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) + +struct sb_writers { + /* Counters for counting writers at each level */ + struct percpu_counter counter[SB_FREEZE_LEVELS]; + wait_queue_head_t wait; /* queue for waiting for + writers / faults to finish */ + int frozen; /* Is sb frozen? */ + wait_queue_head_t wait_unfrozen; /* queue for waiting for + sb to be thawed */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map lock_map[SB_FREEZE_LEVELS]; +#endif +}; + struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ @@ -1505,8 +1535,7 @@ struct super_block { struct hlist_node s_instances; struct quota_info s_dquot; /* Diskquota specific options */ - int s_frozen; - wait_queue_head_t s_wait_unfrozen; + struct sb_writers s_writers; char s_id[32]; /* Informational name */ u8 s_uuid[16]; /* UUID */ @@ -1561,14 +1590,117 @@ extern struct timespec current_fs_time(struct super_block *sb); /* * Snapshotting support. */ -enum { - SB_UNFROZEN = 0, - SB_FREEZE_WRITE = 1, - SB_FREEZE_TRANS = 2, -}; -#define vfs_check_frozen(sb, level) \ - wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) +void __sb_end_write(struct super_block *sb, int level); +int __sb_start_write(struct super_block *sb, int level, bool wait); + +/** + * sb_end_write - drop write access to a superblock + * @sb: the super we wrote to + * + * Decrement number of writers to the filesystem. Wake up possible waiters + * wanting to freeze the filesystem. + */ +static inline void sb_end_write(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_WRITE); +} + +/** + * sb_end_pagefault - drop write access to a superblock from a page fault + * @sb: the super we wrote to + * + * Decrement number of processes handling write page fault to the filesystem. + * Wake up possible waiters wanting to freeze the filesystem. + */ +static inline void sb_end_pagefault(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_PAGEFAULT); +} + +/** + * sb_end_intwrite - drop write access to a superblock for internal fs purposes + * @sb: the super we wrote to + * + * Decrement fs-internal number of writers to the filesystem. Wake up possible + * waiters wanting to freeze the filesystem. + */ +static inline void sb_end_intwrite(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_FS); +} + +/** + * sb_start_write - get write access to a superblock + * @sb: the super we write to + * + * When a process wants to write data or metadata to a file system (i.e. dirty + * a page or an inode), it should embed the operation in a sb_start_write() - + * sb_end_write() pair to get exclusion against file system freezing. This + * function increments number of writers preventing freezing. If the file + * system is already frozen, the function waits until the file system is + * thawed. + * + * Since freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. Generally, + * freeze protection should be the outermost lock. In particular, we have: + * + * sb_start_write + * -> i_mutex (write path, truncate, directory ops, ...) + * -> s_umount (freeze_super, thaw_super) + */ +static inline void sb_start_write(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_WRITE, true); +} + +static inline int sb_start_write_trylock(struct super_block *sb) +{ + return __sb_start_write(sb, SB_FREEZE_WRITE, false); +} + +/** + * sb_start_pagefault - get write access to a superblock from a page fault + * @sb: the super we write to + * + * When a process starts handling write page fault, it should embed the + * operation into sb_start_pagefault() - sb_end_pagefault() pair to get + * exclusion against file system freezing. This is needed since the page fault + * is going to dirty a page. This function increments number of running page + * faults preventing freezing. If the file system is already frozen, the + * function waits until the file system is thawed. + * + * Since page fault freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. It is advised to + * put sb_start_pagefault() close to mmap_sem in lock ordering. Page fault + * handling code implies lock dependency: + * + * mmap_sem + * -> sb_start_pagefault + */ +static inline void sb_start_pagefault(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_PAGEFAULT, true); +} + +/* + * sb_start_intwrite - get write access to a superblock for internal fs purposes + * @sb: the super we write to + * + * This is the third level of protection against filesystem freezing. It is + * free for use by a filesystem. The only requirement is that it must rank + * below sb_start_pagefault. + * + * For example filesystem can call sb_start_intwrite() when starting a + * transaction which somewhat eases handling of freezing for internal sources + * of filesystem changes (internal fs threads, discarding preallocation on file + * close, etc.). + */ +static inline void sb_start_intwrite(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_FS, true); +} + extern bool inode_owner_or_capable(const struct inode *inode); @@ -1892,6 +2024,7 @@ struct file_system_type { struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; struct lock_class_key s_vfs_rename_key; + struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; @@ -2334,9 +2467,6 @@ static inline void i_readcount_inc(struct inode *inode) } #endif extern int do_pipe_flags(int *, int); -extern struct file *create_read_pipe(struct file *f, int flags); -extern struct file *create_write_pipe(int flags); -extern void free_write_pipe(struct file *); extern int kernel_read(struct file *, loff_t, char *, unsigned long); extern struct file * open_exec(const char *); diff --git a/include/linux/mm.h b/include/linux/mm.h index bd079a1b0fdc..311be906b57d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1441,6 +1441,7 @@ extern void truncate_inode_pages_range(struct address_space *, /* generic vm_area_ops exported for stackable file systems */ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); +extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); /* mm/page-writeback.c */ int write_one_page(struct page *page, int wait); diff --git a/include/linux/namei.h b/include/linux/namei.h index d2ef8b34b967..4bf19d8174ed 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -67,6 +67,7 @@ extern int kern_path(const char *, unsigned, struct path *); extern struct dentry *kern_path_create(int, const char *, struct path *, int); extern struct dentry *user_path_create(int, const char __user *, struct path *, int); +extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index ce4743a26015..fa63048fecff 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -143,6 +143,7 @@ typedef struct svc_fh { int fh_maxsize; /* max size for fh_handle */ unsigned char fh_locked; /* inode locked by us */ + unsigned char fh_want_write; /* remount protection taken */ #ifdef CONFIG_NFSD_V3 unsigned char fh_post_saved; /* post-op attrs saved */ diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index e11d1c0fc60f..ad1a427b5267 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -160,4 +160,6 @@ void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); long pipe_fcntl(struct file *, unsigned int, unsigned long arg); struct pipe_inode_info *get_pipe_info(struct file *file); +int create_pipe_files(struct file **, int); + #endif diff --git a/kernel/audit.c b/kernel/audit.c index 4a3f28d2ca65..ea3b7b6191c7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1456,6 +1456,27 @@ void audit_log_key(struct audit_buffer *ab, char *key) } /** + * audit_log_link_denied - report a link restriction denial + * @operation: specific link opreation + * @link: the path that triggered the restriction + */ +void audit_log_link_denied(const char *operation, struct path *link) +{ + struct audit_buffer *ab; + + ab = audit_log_start(current->audit_context, GFP_KERNEL, + AUDIT_ANOM_LINK); + audit_log_format(ab, "op=%s action=denied", operation); + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_d_path(ab, " path=", link); + audit_log_format(ab, " dev="); + audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id); + audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino); + audit_log_end(ab); +} + +/** * audit_log_end - end one audit record * @ab: the audit_buffer * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6502d35a25ba..87174ef59161 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1498,6 +1498,24 @@ static struct ctl_table fs_table[] = { #endif #endif { + .procname = "protected_symlinks", + .data = &sysctl_protected_symlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "protected_hardlinks", + .data = &sysctl_protected_hardlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index f8a3f1a829b8..ba6085d9c741 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -12,7 +12,7 @@ #ifdef CONFIG_HOTPLUG_CPU static LIST_HEAD(percpu_counters); -static DEFINE_MUTEX(percpu_counters_lock); +static DEFINE_SPINLOCK(percpu_counters_lock); #endif #ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER @@ -123,9 +123,9 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc->list); - mutex_lock(&percpu_counters_lock); + spin_lock(&percpu_counters_lock); list_add(&fbc->list, &percpu_counters); - mutex_unlock(&percpu_counters_lock); + spin_unlock(&percpu_counters_lock); #endif return 0; } @@ -139,9 +139,9 @@ void percpu_counter_destroy(struct percpu_counter *fbc) debug_percpu_counter_deactivate(fbc); #ifdef CONFIG_HOTPLUG_CPU - mutex_lock(&percpu_counters_lock); + spin_lock(&percpu_counters_lock); list_del(&fbc->list); - mutex_unlock(&percpu_counters_lock); + spin_unlock(&percpu_counters_lock); #endif free_percpu(fbc->counters); fbc->counters = NULL; @@ -170,7 +170,7 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb, return NOTIFY_OK; cpu = (unsigned long)hcpu; - mutex_lock(&percpu_counters_lock); + spin_lock(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; unsigned long flags; @@ -181,7 +181,7 @@ static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb, *pcount = 0; raw_spin_unlock_irqrestore(&fbc->lock, flags); } - mutex_unlock(&percpu_counters_lock); + spin_unlock(&percpu_counters_lock); #endif return NOTIFY_OK; } diff --git a/mm/filemap.c b/mm/filemap.c index a4a5260b0279..fa5ca304148e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1712,8 +1712,35 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); +int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + int ret = VM_FAULT_LOCKED; + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + lock_page(page); + if (page->mapping != inode->i_mapping) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out; + } + /* + * We mark the page dirty already here so that when freeze is in + * progress, we are guaranteed that writeback during freezing will + * see the dirty page and writeprotect it again. + */ + set_page_dirty(page); +out: + sb_end_pagefault(inode->i_sb); + return ret; +} +EXPORT_SYMBOL(filemap_page_mkwrite); + const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, + .page_mkwrite = filemap_page_mkwrite, }; /* This is used for a general mmap of a disk file */ @@ -2407,8 +2434,6 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, count = ocount; pos = *ppos; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; written = 0; @@ -2507,6 +2532,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); + sb_start_write(inode->i_sb); mutex_lock(&inode->i_mutex); blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); @@ -2520,6 +2546,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ret = err; } blk_finish_plug(&plug); + sb_end_write(inode->i_sb); return ret; } EXPORT_SYMBOL(generic_file_aio_write); diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 213ca1f53409..13e013b1270c 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -304,6 +304,7 @@ out: static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, + .page_mkwrite = filemap_page_mkwrite, }; int xip_file_mmap(struct file * file, struct vm_area_struct * vma) @@ -401,6 +402,8 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, loff_t pos; ssize_t ret; + sb_start_write(inode->i_sb); + mutex_lock(&inode->i_mutex); if (!access_ok(VERIFY_READ, buf, len)) { @@ -411,8 +414,6 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, pos = *ppos; count = len; - vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -436,6 +437,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, current->backing_dev_info = NULL; out_up: mutex_unlock(&inode->i_mutex); + sb_end_write(inode->i_sb); return ret; } EXPORT_SYMBOL_GPL(xip_file_write); diff --git a/mm/memory.c b/mm/memory.c index 482f089765ff..57361708d1a5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2650,6 +2650,9 @@ reuse: if (!page_mkwrite) { wait_on_page_locked(dirty_page); set_page_dirty_balance(dirty_page, page_mkwrite); + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); } put_page(dirty_page); if (page_mkwrite) { @@ -2667,10 +2670,6 @@ reuse: } } - /* file_update_time outside page_lock */ - if (vma->vm_file) - file_update_time(vma->vm_file); - return ret; } @@ -3339,12 +3338,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (dirty_page) { struct address_space *mapping = page->mapping; + int dirtied = 0; if (set_page_dirty(dirty_page)) - page_mkwrite = 1; + dirtied = 1; unlock_page(dirty_page); put_page(dirty_page); - if (page_mkwrite && mapping) { + if ((dirtied || page_mkwrite) && mapping) { /* * Some device drivers do not set page.mapping but still * dirty their pages @@ -3353,7 +3353,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, } /* file_update_time outside page_lock */ - if (vma->vm_file) + if (vma->vm_file && !page_mkwrite) file_update_time(vma->vm_file); } else { unlock_page(vmf.page); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 79981d97bc9c..e4768c180da2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -823,6 +823,34 @@ fail: return NULL; } +static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) +{ + struct dentry *dentry; + struct path path; + int err = 0; + /* + * Get the parent directory, calculate the hash for last + * component. + */ + dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + return err; + + /* + * All right, let's create it. + */ + err = security_path_mknod(&path, dentry, mode, 0); + if (!err) { + err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); + if (!err) { + res->mnt = mntget(path.mnt); + res->dentry = dget(dentry); + } + } + done_path_create(&path, dentry); + return err; +} static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { @@ -831,8 +859,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; - struct dentry *dentry = NULL; - struct path path; int err; unsigned int hash; struct unix_address *addr; @@ -869,43 +895,23 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) atomic_set(&addr->refcnt, 1); if (sun_path[0]) { - umode_t mode; - err = 0; - /* - * Get the parent directory, calculate the hash for last - * component. - */ - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto out_mknod_parent; - - /* - * All right, let's create it. - */ - mode = S_IFSOCK | + struct path path; + umode_t mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = mnt_want_write(path.mnt); - if (err) - goto out_mknod_dput; - err = security_path_mknod(&path, dentry, mode, 0); - if (err) - goto out_mknod_drop_write; - err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0); -out_mknod_drop_write: - mnt_drop_write(path.mnt); - if (err) - goto out_mknod_dput; - mutex_unlock(&path.dentry->d_inode->i_mutex); - dput(path.dentry); - path.dentry = dentry; - + err = unix_mknod(sun_path, mode, &path); + if (err) { + if (err == -EEXIST) + err = -EADDRINUSE; + unix_release_addr(addr); + goto out_up; + } addr->hash = UNIX_HASH_SIZE; - } - - spin_lock(&unix_table_lock); - - if (!sun_path[0]) { + hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1); + spin_lock(&unix_table_lock); + u->path = path; + list = &unix_socket_table[hash]; + } else { + spin_lock(&unix_table_lock); err = -EADDRINUSE; if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { @@ -914,9 +920,6 @@ out_mknod_drop_write: } list = &unix_socket_table[addr->hash]; - } else { - list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; - u->path = path; } err = 0; @@ -930,16 +933,6 @@ out_up: mutex_unlock(&u->readlock); out: return err; - -out_mknod_dput: - dput(dentry); - mutex_unlock(&path.dentry->d_inode->i_mutex); - path_put(&path); -out_mknod_parent: - if (err == -EEXIST) - err = -EADDRINUSE; - unix_release_addr(addr); - goto out_up; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) diff --git a/sound/sound_firmware.c b/sound/sound_firmware.c index 7e96249536b4..37711a5d0d6b 100644 --- a/sound/sound_firmware.c +++ b/sound/sound_firmware.c @@ -23,14 +23,14 @@ static int do_mod_firmware_load(const char *fn, char **fp) if (l <= 0 || l > 131072) { printk(KERN_INFO "Invalid firmware '%s'\n", fn); - filp_close(filp, current->files); + filp_close(filp, NULL); return 0; } dp = vmalloc(l); if (dp == NULL) { printk(KERN_INFO "Out of memory loading '%s'.\n", fn); - filp_close(filp, current->files); + filp_close(filp, NULL); return 0; } pos = 0; @@ -38,10 +38,10 @@ static int do_mod_firmware_load(const char *fn, char **fp) { printk(KERN_INFO "Failed to read '%s'.\n", fn); vfree(dp); - filp_close(filp, current->files); + filp_close(filp, NULL); return 0; } - filp_close(filp, current->files); + filp_close(filp, NULL); *fp = dp; return (int) l; } |